
test_optimizer.py

# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from io import BytesIO

import numpy as np
from helpers import MLP, graph_mode

import megengine.functional as F
from megengine import load, optimizer, save
from megengine.core import TensorDict, tensor
from megengine.jit import trace
from megengine.test import assertTensorClose


def get_input():
    batch_size, input_dim = 2, 28
    data_shape, label_shape = (batch_size, input_dim), (batch_size,)
    data, label = tensor(dtype=np.float32), tensor(dtype=np.int32)
    data.set_value(np.random.random(data_shape).astype(np.float32))
    label.set_value(np.random.randint(0, 10, label_shape))
    return data, data_shape, label, label_shape


@graph_mode("eager", "static")
def test_optimizer_serialization():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = optimizer.SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    # Track the expected momentum buffer for every parameter by hand.
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)

    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    for param in mlp.parameters():
        slots[param] = slots[param] * 0.9 + param.grad.numpy()

    with BytesIO() as fout:
        save(opt.state_dict(), fout)
        fout.seek(0)
        state_dict = load(fout)
        # A new optimizer created with different hyper-parameters should
        # recover lr=0.01 and momentum=0.9 from the loaded state dict.
        opt1 = optimizer.SGD(mlp.parameters(), lr=0.02, momentum=0.8)
        opt1.load_state_dict(state_dict)

        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt1.zero_grad()
        opt1.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt1.step()
        for param in mlp.parameters():
            orig_param = orig_params[param]
            slots[param] = slots[param] * 0.9 + param.grad.numpy()
            assertTensorClose(param.numpy(), orig_param - 0.01 * slots[param])


def _test_optimizer(opt_str, test_case, check_class, update_lr=False):
    iter_num = 3
    data, data_shape, label, label_shape = get_input()
    net = MLP()
    opt = getattr(optimizer, opt_str)(net.parameters(), **test_case)
    check_func = check_class(net, **test_case)
    step = 0

    # eager graph
    for i in range(iter_num):
        if update_lr and i == 1:  # change learning rate
            for group in opt.param_groups:
                group["lr"] += 0.01
            check_func.lr += 0.01
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        ori_params = TensorDict()
        for param in net.parameters():
            ori_params[param] = np.copy(param.numpy())
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)

    # static graph
    @trace
    def train_func(data, label):
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for i in range(iter_num):
        if update_lr and i == 1:  # change learning rate
            for group in opt.param_groups:
                group["lr"] += 0.01
            check_func.lr += 0.01
        opt.zero_grad()
        ori_params = TensorDict()
        for param in net.parameters():
            ori_params[param] = np.copy(param.numpy())
        train_func(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)


def test_sgd():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.slots = TensorDict()
            for param in net.parameters():
                self.slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
            for param in new_params:
                grad = param.grad.numpy()
                if hasattr(self, "momentum"):
                    # momentum SGD: v <- grad + momentum * v, param <- param - lr * v
                    self.slots[param] = grad + self.slots[param] * self.momentum
                    delta = -self.lr * self.slots[param]
                else:
                    # vanilla SGD: param <- param - lr * grad
                    delta = -self.lr * grad
                assertTensorClose(param.numpy(), ori_params[param] + delta)

    cases = [
        {"momentum": 0.9, "lr": 0.01},  # SGD with momentum
        {"lr": 0.01},  # simple SGD
        {"weight_decay": 0.1, "lr": 0.01},  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("SGD", case, CheckValue)
        _test_optimizer("SGD", case, CheckValue, update_lr=True)


def test_adam():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.m_slots = TensorDict()
            self.v_slots = TensorDict()
            for param in net.parameters():
                self.m_slots[param] = np.zeros(param.shape).astype(np.float32)
                self.v_slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
            for param in new_params:
                grad = param.grad.numpy()
                m = self.m_slots[param]
                v = self.v_slots[param]
                # exponential moving averages of the gradient and its square
                m *= self.betas[0]
                m += (1 - self.betas[0]) * grad
                v *= self.betas[1]
                v += (1 - self.betas[1]) * grad * grad
                # bias-corrected Adam update: m_hat / (sqrt(v_hat) + eps)
                delta = (m / (1 - self.betas[0] ** step)) / (
                    np.sqrt(v / (1 - self.betas[1] ** step)) + self.eps
                )
                assertTensorClose(param.numpy(), ori_params[param] - self.lr * delta)

    cases = [
        {"betas": (0.8, 0.9), "eps": 1e-04, "lr": 0.01},
        {
            "betas": (0.8, 0.9),
            "eps": 1e-04,
            "lr": 0.01,
            "weight_decay": 0.1,
        },  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("Adam", case, CheckValue)
        _test_optimizer("Adam", case, CheckValue, update_lr=True)


def test_adagrad():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.s_slots = TensorDict()
            for param in net.parameters():
                self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
            for param in new_params:
                grad = param.grad.numpy()
                # accumulate squared gradients, then scale by a decayed learning rate
                self.s_slots[param] += grad ** 2
                delta = grad / (self.s_slots[param] + self.eps) ** 0.5
                delta *= -(self.lr / (1 + (step - 1) * self.lr_decay))
                assertTensorClose(param.numpy(), ori_params[param] + delta)

    cases = [
        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01},
        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.0},  # without lr_decay
        {
            "lr": 0.01,
            "eps": 1e-06,
            "lr_decay": 0.01,
            "weight_decay": 0.1,
        },  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("Adagrad", case, CheckValue)
        _test_optimizer("Adagrad", case, CheckValue, update_lr=True)


def test_adadelta():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.s_slots = TensorDict()
            self.a_slots = TensorDict()
            for param in net.parameters():
                self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
                self.a_slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
            for param in new_params:
                grad = param.grad.numpy()
                # running average of squared gradients
                self.s_slots[param] = self.s_slots[param] * self.rho + grad ** 2 * (
                    1 - self.rho
                )
                delta = (
                    grad
                    * ((self.a_slots[param] + self.eps) ** 0.5)
                    / (self.s_slots[param] + self.eps) ** 0.5
                )
                # running average of squared parameter updates
                self.a_slots[param] = self.a_slots[param] * self.rho + delta ** 2 * (
                    1 - self.rho
                )
                delta *= -self.lr
                assertTensorClose(param.numpy(), ori_params[param] + delta)

    cases = [
        {"lr": 1.0, "eps": 1e-06, "rho": 0.9},
        {"lr": 1.0, "eps": 1e-06, "rho": 0.9, "weight_decay": 0.9},  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("Adadelta", case, CheckValue)
        _test_optimizer("Adadelta", case, CheckValue, update_lr=True)

The MegEngine package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has GPU hardware installed along with a working driver. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
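As a minimal sketch of making the CPU/GPU situation explicit before running training code or these tests, the snippet below checks at runtime whether the CUDA path is usable. It assumes the `megengine.is_cuda_available()` helper found in recent MegEngine releases; older versions expose device queries under different names, so treat it as illustrative rather than canonical.

import megengine as mge

# Pre-flight check (assumed API): report whether CUDA kernels can be used.
# If no GPU or driver is present, MegEngine falls back to running on CPU.
if mge.is_cuda_available():
    print("CUDA device detected; GPU kernels will be used.")
else:
    print("No usable GPU found; running on CPU.")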