# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from io import BytesIO

import numpy as np
from helpers import MLP, graph_mode

import megengine.functional as F
from megengine import load, save
from megengine.core import TensorDict, tensor
from megengine.jit import trace
from megengine.optimizer import SGD, Adam
from megengine.test import assertTensorClose


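# Construct a random (batch_size, input_dim) float32 data tensor and an int32
# label tensor with values in [0, 10), returned together with their shapes.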
def get_input():
    batch_size = 2
    input_dim = 28
    data_shape = (batch_size, input_dim)
    label_shape = (batch_size,)
    data = tensor()
    label = tensor(dtype=np.int32)
    data.set_value(np.random.random(data_shape).astype(np.float32))
    label.set_value(np.random.randint(0, 10, label_shape))
    return data, data_shape, label, label_shape


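# Plain SGD with weight decay: after each step every parameter should equal
# param * (1 - lr * weight_decay) - lr * grad, i.e. param * 0.999 - 0.01 * grad.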
def test_sgd_simple():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, weight_decay=0.1)
    for idx in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        # clear gradients alternately via the optimizer and via the module
        if idx % 2:
            opt.zero_grad()
        else:
            mlp.zero_grad()
        opt.backward(loss)
        grads = TensorDict()
        orig_params = TensorDict()
        for param in mlp.parameters():
            grad = F.grad(loss, param, use_virtual_grad=False)
            assertTensorClose(grad.numpy(), param.grad.numpy())
            grads[param] = np.copy(grad.numpy())
            orig_params[param] = np.copy(param.numpy())
        opt.step()
        for param in mlp.parameters():
            assertTensorClose(
                param.numpy(), orig_params[param] * 0.999 - grads[param] * 0.01
            )


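# SGD with momentum: track the velocity slot manually
# (slot = momentum * slot - lr * grad) and check param_new == param_old + slot.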
def test_sgd_momentum():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        orig_params = TensorDict()
        grads = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
            grads[param] = np.copy(param.grad.numpy())
        opt.step()
        for param in mlp.parameters():
            slot = slots[param]
            orig_param = orig_params[param]
            slot *= 0.9
            slot -= param.grad.numpy() * 0.01
            assertTensorClose(param.numpy(), orig_param + slot)


# TODO: put opt.step() inside trace
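# Same momentum check as above, but forward/backward run inside a traced
# (static-graph) function; opt.step() still executes outside the trace.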
def test_sgd_momentum_static():
    _, data_shape, _, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)

    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)

    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)
    for _ in range(3):
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        orig_params = TensorDict()
        grads = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
            grads[param] = np.copy(param.grad.numpy())
        opt.step()
        for param in mlp.parameters():
            slot = slots[param]
            orig_param = orig_params[param]
            slot *= 0.9
            slot -= param.grad.numpy() * 0.01
            assertTensorClose(param.numpy(), orig_param + slot)


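# Changing group["lr"] after construction must take effect immediately:
# with lr raised from 0.01 to 0.03, updates should be param - 0.03 * grad.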
def test_update_lr():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01)
    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    for group in opt.param_groups:
        group["lr"] += 0.02
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        for param in mlp.parameters():
            grad = F.grad(loss, param, use_virtual_grad=False)
            assertTensorClose(grad.numpy(), param.grad.numpy())
        orig_params = []
        for param in mlp.parameters():
            orig_params.append(np.copy(param.numpy()))
        opt.step()
        for param, orig_param in zip(mlp.parameters(), orig_params):
            assertTensorClose(param.numpy(), orig_param - param.grad.numpy() * 0.03)


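# Adam: compare parameters against a manual re-implementation that keeps its
# own first/second moment estimates and applies bias correction, first in
# eager mode and then with backward wrapped in a traced function.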
def test_adam():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    beta0 = 0.8
    beta1 = 0.9
    eps = 1e-4
    opt = Adam(mlp.parameters(), lr=0.01, betas=(beta0, beta1), eps=eps)
    m_slots = TensorDict()
    v_slots = TensorDict()
    for param in mlp.parameters():
        m_slots[param] = np.zeros(param.shape).astype(np.float32)
        v_slots[param] = np.zeros(param.shape).astype(np.float32)
    step_size = 0

    def check_value():
        for param in mlp.parameters():
            grad = param.grad.numpy()
            orig_param = orig_params[param]
            m = m_slots[param]
            v = v_slots[param]
            # exponential moving averages of the gradient and its square
            m *= beta0
            m += (1 - beta0) * grad
            v *= beta1
            v += (1 - beta1) * grad * grad
            # bias-corrected update: m_hat / (sqrt(v_hat) + eps)
            update = (m / (1 - beta0 ** step_size)) / (
                np.sqrt(v / (1 - beta1 ** step_size)) + eps
            )
            assertTensorClose(param.numpy(), orig_param - 0.01 * update)

    # eager
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        grads = opt.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt.step()
        step_size += 1
        check_value()

    # static
    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for _ in range(3):
        opt.zero_grad()
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step_size += 1
        check_value()


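# Serialization: save the momentum-SGD state_dict, load it into a new SGD
# instance created with different hyper-parameters, and check that the
# restored lr/momentum and slot values drive the next update.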
@graph_mode("eager", "static")
def test_optimizer_serialization():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)

    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    for param in mlp.parameters():
        slot = slots[param]
        slot *= 0.9
        slot -= param.grad.numpy() * 0.01

    with BytesIO() as fout:
        save(opt.state_dict(), fout)
        fout.seek(0)
        state_dict = load(fout)
        opt1 = SGD(mlp.parameters(), lr=0.02, momentum=0.8)
        opt1.load_state_dict(state_dict)

        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt1.zero_grad()
        opt1.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt1.step()
        for param in mlp.parameters():
            orig_param = orig_params[param]
            slot = slots[param]
            slot *= 0.9
            slot -= param.grad.numpy() * 0.01
            assertTensorClose(param.numpy(), orig_param + slot)