# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from io import BytesIO

import numpy as np
from helpers import MLP, graph_mode

import megengine.functional as F
from megengine import load, optimizer, save
from megengine.core import TensorDict, tensor
from megengine.jit import trace
from megengine.test import assertTensorClose
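# MLP and graph_mode are test-local utilities from the helpers module next to
# this file: MLP is the toy network being optimized, and graph_mode re-runs a
# test under each listed execution mode.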


def get_input():
    batch_size, input_dim = 2, 28
    data_shape, label_shape = (batch_size, input_dim), (batch_size,)
    data, label = tensor(dtype=np.float32), tensor(dtype=np.int32)
    data.set_value(np.random.random(data_shape).astype(np.float32))
    label.set_value(np.random.randint(0, 10, label_shape))
    return data, data_shape, label, label_shape


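# Serializes an SGD-with-momentum state_dict, loads it into a fresh optimizer
# created with different hyperparameters, and checks that the next update still
# follows the saved settings and momentum buffers.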
@graph_mode("eager", "static")
def test_optimizer_serialization():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = optimizer.SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)

    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    for param in mlp.parameters():
        slots[param] = slots[param] * 0.9 + param.grad.numpy()

    with BytesIO() as fout:
        save(opt.state_dict(), fout)
        fout.seek(0)
        state_dict = load(fout)
        opt1 = optimizer.SGD(mlp.parameters(), lr=0.02, momentum=0.8)
        opt1.load_state_dict(state_dict)
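        # The assertions below expect lr=0.01 and momentum=0.9: loading the
        # state_dict should restore the saved hyperparameters and momentum
        # buffers, overriding the lr=0.02 / momentum=0.8 passed to opt1.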

        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt1.zero_grad()
        opt1.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt1.step()
        for param in mlp.parameters():
            orig_param = orig_params[param]
            slots[param] = slots[param] * 0.9 + param.grad.numpy()
            assertTensorClose(param.numpy(), orig_param - 0.01 * slots[param])


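# Shared driver: runs iter_num update steps first in eager mode, then inside a
# traced (static graph) training function, and after every step compares the
# parameters against check_class, a NumPy reference of the optimizer rule.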
def _test_optimizer(opt_str, test_case, check_class, update_lr=False):
    iter_num = 3
    data, data_shape, label, label_shape = get_input()

    net = MLP()
    opt = getattr(optimizer, opt_str)(net.parameters(), **test_case)
    check_func = check_class(net, **test_case)

    step = 0

    # eager graph
    for i in range(iter_num):
        if update_lr and i == 1:  # change learning rate
            for group in opt.param_groups:
                group["lr"] += 0.01
            check_func.lr += 0.01
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        ori_params = TensorDict()
        for param in net.parameters():
            ori_params[param] = np.copy(param.numpy())
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)

    # static graph
    @trace
    def train_func(data, label):
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for i in range(iter_num):
        if update_lr and i == 1:  # change learning rate
            for group in opt.param_groups:
                group["lr"] += 0.01
            check_func.lr += 0.01
        opt.zero_grad()
        ori_params = TensorDict()
        for param in net.parameters():
            ori_params[param] = np.copy(param.numpy())
        train_func(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)


def test_sgd():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.slots = TensorDict()
            for param in net.parameters():
                self.slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
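            # Reference SGD rule: with momentum, v = momentum * v + grad and
            # param -= lr * v; without momentum, param -= lr * grad.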
            for param in new_params:
                grad = param.grad.numpy()
                if hasattr(self, "momentum"):
                    self.slots[param] = grad + self.slots[param] * self.momentum
                    delta = -self.lr * self.slots[param]
                else:
                    delta = -self.lr * grad
                assertTensorClose(param.numpy(), ori_params[param] + delta)

    cases = [
        {"momentum": 0.9, "lr": 0.01},  # SGD with momentum
        {"lr": 0.01},  # simple SGD
        {"weight_decay": 0.1, "lr": 0.01},  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("SGD", case, CheckValue)
        _test_optimizer("SGD", case, CheckValue, update_lr=True)


def test_adam():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.m_slots = TensorDict()
            self.v_slots = TensorDict()
            for param in net.parameters():
                self.m_slots[param] = np.zeros(param.shape).astype(np.float32)
                self.v_slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
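            # Reference Adam rule: m and v are EMAs of grad and grad**2 with
            # decay rates betas; both are bias-corrected by (1 - beta**step),
            # then param -= lr * m_hat / (sqrt(v_hat) + eps).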
            for param in new_params:
                grad = param.grad.numpy()
                m = self.m_slots[param]
                v = self.v_slots[param]
                m *= self.betas[0]
                m += (1 - self.betas[0]) * grad
                v *= self.betas[1]
                v += (1 - self.betas[1]) * grad * grad
                delta = (m / (1 - self.betas[0] ** step)) / (
                    np.sqrt(v / (1 - self.betas[1] ** step)) + self.eps
                )
                assertTensorClose(param.numpy(), ori_params[param] - self.lr * delta)

    cases = [
        {"betas": (0.8, 0.9), "eps": 1e-04, "lr": 0.01},
        {
            "betas": (0.8, 0.9),
            "eps": 1e-04,
            "lr": 0.01,
            "weight_decay": 0.1,
        },  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("Adam", case, CheckValue)
        _test_optimizer("Adam", case, CheckValue, update_lr=True)


def test_adagrad():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.s_slots = TensorDict()
            for param in net.parameters():
                self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
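            # Reference Adagrad rule: accumulate grad**2 in s, then
            # param -= lr / (1 + (step - 1) * lr_decay) * grad / sqrt(s + eps).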
            for param in new_params:
                grad = param.grad.numpy()
                self.s_slots[param] += grad ** 2
                delta = grad / (self.s_slots[param] + self.eps) ** 0.5
                delta *= -(self.lr / (1 + (step - 1) * self.lr_decay))
                assertTensorClose(param.numpy(), ori_params[param] + delta)

    cases = [
        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01},
        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.0},  # without lr_decay
        {
            "lr": 0.01,
            "eps": 1e-06,
            "lr_decay": 0.01,
            "weight_decay": 0.1,
        },  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("Adagrad", case, CheckValue)
        _test_optimizer("Adagrad", case, CheckValue, update_lr=True)


def test_adadelta():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.s_slots = TensorDict()
            self.a_slots = TensorDict()
            for param in net.parameters():
                self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
                self.a_slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
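            # Reference Adadelta rule: s is a running average of grad**2, a of
            # squared updates; the step is lr * grad * sqrt(a + eps) / sqrt(s + eps).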
            for param in new_params:
                grad = param.grad.numpy()
                self.s_slots[param] = self.s_slots[param] * self.rho + grad ** 2 * (
                    1 - self.rho
                )
                delta = (
                    grad
                    * ((self.a_slots[param] + self.eps) ** 0.5)
                    / (self.s_slots[param] + self.eps) ** 0.5
                )
                self.a_slots[param] = self.a_slots[param] * self.rho + delta ** 2 * (
                    1 - self.rho
                )
                delta *= -self.lr
                assertTensorClose(param.numpy(), ori_params[param] + delta)

    cases = [
        {"lr": 1.0, "eps": 1e-06, "rho": 0.9},
        {"lr": 1.0, "eps": 1e-06, "rho": 0.9, "weight_decay": 0.9},  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("Adadelta", case, CheckValue)
        _test_optimizer("Adadelta", case, CheckValue, update_lr=True)