
test_optimizer.py 8.9 kB

# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from io import BytesIO

import numpy as np
from helpers import MLP, graph_mode

import megengine.functional as F
from megengine import load, save
from megengine.core import TensorDict, tensor
from megengine.jit import trace
from megengine.optimizer import SGD, Adam
from megengine.test import assertTensorClose


def get_input():
    batch_size = 2
    input_dim = 28
    data_shape = (batch_size, input_dim)
    label_shape = (batch_size,)
    data = tensor()
    label = tensor(dtype=np.int32)
    data.set_value(np.random.random(data_shape).astype(np.float32))
    label.set_value(np.random.randint(0, 10, label_shape))
    return data, data_shape, label, label_shape


def test_sgd_simple():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, weight_decay=0.1)
    for idx in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        # zero_grad on the optimizer and on the module should behave the same
        if idx % 2:
            opt.zero_grad()
        else:
            mlp.zero_grad()
        opt.backward(loss)
        grads = TensorDict()
        orig_params = TensorDict()
        for param in mlp.parameters():
            grad = F.grad(loss, param, use_virtual_grad=False)
            assertTensorClose(grad.numpy(), param.grad.numpy())
            grads[param] = np.copy(grad.numpy())
            orig_params[param] = np.copy(param.numpy())
        opt.step()
        # with lr=0.01 and weight_decay=0.1, one SGD step is
        # param * (1 - lr * weight_decay) - lr * grad
        for param in mlp.parameters():
            assertTensorClose(
                param.numpy(), orig_params[param] * 0.999 - grads[param] * 0.01
            )


def test_sgd_momentum():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    # manually tracked momentum buffers, one per parameter
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        orig_params = TensorDict()
        grads = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
            grads[param] = np.copy(param.grad.numpy())
        opt.step()
        for param in mlp.parameters():
            slot = slots[param]
            orig_param = orig_params[param]
            slot *= 0.9
            slot -= param.grad.numpy() * 0.01
            assertTensorClose(param.numpy(), orig_param + slot)


# TODO: put opt.step() inside trace
def test_sgd_momentum_static():
    _, data_shape, _, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)

    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)

    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)
    for _ in range(3):
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        orig_params = TensorDict()
        grads = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
            grads[param] = np.copy(param.grad.numpy())
        opt.step()
        for param in mlp.parameters():
            slot = slots[param]
            orig_param = orig_params[param]
            slot *= 0.9
            slot -= param.grad.numpy() * 0.01
            assertTensorClose(param.numpy(), orig_param + slot)


def test_update_lr():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01)
    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    # raise the learning rate from 0.01 to 0.03 in place
    for group in opt.param_groups:
        group["lr"] += 0.02
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        for param in mlp.parameters():
            grad = F.grad(loss, param, use_virtual_grad=False)
            assertTensorClose(grad.numpy(), param.grad.numpy())
        orig_params = []
        for param in mlp.parameters():
            orig_params.append(np.copy(param.numpy()))
        opt.step()
        for param, orig_param in zip(mlp.parameters(), orig_params):
            assertTensorClose(param.numpy(), orig_param - param.grad.numpy() * 0.03)


def test_adam():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    beta0 = 0.8
    beta1 = 0.9
    eps = 1e-4
    opt = Adam(mlp.parameters(), lr=0.01, betas=(beta0, beta1), eps=eps)
    # manually tracked first- and second-moment buffers
    m_slots = TensorDict()
    v_slots = TensorDict()
    for param in mlp.parameters():
        m_slots[param] = np.zeros(param.shape).astype(np.float32)
        v_slots[param] = np.zeros(param.shape).astype(np.float32)
    step_size = 0

    def check_value():
        # bias-corrected Adam update, recomputed by hand
        for param in mlp.parameters():
            grad = param.grad.numpy()
            orig_param = orig_params[param]
            m = m_slots[param]
            v = v_slots[param]
            m *= beta0
            m += (1 - beta0) * grad
            v *= beta1
            v += (1 - beta1) * grad * grad
            update = (m / (1 - beta0 ** step_size)) / (
                np.sqrt(v / (1 - beta1 ** step_size)) + eps
            )
            assertTensorClose(param.numpy(), orig_param - 0.01 * update)

    # eager
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        grads = opt.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt.step()
        step_size += 1
        check_value()

    # static
    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for _ in range(3):
        opt.zero_grad()
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step_size += 1
        check_value()


@graph_mode("eager", "static")
def test_optimizer_serialization():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)
    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    for param in mlp.parameters():
        slot = slots[param]
        slot *= 0.9
        slot -= param.grad.numpy() * 0.01
    with BytesIO() as fout:
        save(opt.state_dict(), fout)
        fout.seek(0)
        state_dict = load(fout)
        # opt1 starts with different hyperparameters; load_state_dict
        # must restore lr=0.01, momentum=0.9 and the momentum buffers
        opt1 = SGD(mlp.parameters(), lr=0.02, momentum=0.8)
        opt1.load_state_dict(state_dict)
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt1.zero_grad()
        opt1.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt1.step()
        for param in mlp.parameters():
            orig_param = orig_params[param]
            slot = slots[param]
            slot *= 0.9
            slot -= param.grad.numpy() * 0.01
            assertTensorClose(param.numpy(), orig_param + slot)
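For reference, the closed-form updates these tests assert can be written down directly. With lr=0.01 and weight_decay=0.1, one plain SGD step reduces to param * 0.999 - grad * 0.01, which is exactly the expression checked in test_sgd_simple. The sketch below restates the three rules in plain NumPy; the helper names (sgd_step, sgd_momentum_step, adam_step) are illustrative only and are not part of the test file or the MegEngine API.

import numpy as np

def sgd_step(param, grad, lr=0.01, weight_decay=0.1):
    # weight decay folds into the update: with lr=0.01, wd=0.1 this is
    # param * 0.999 - grad * 0.01, as asserted in test_sgd_simple
    return param * (1 - lr * weight_decay) - lr * grad

def sgd_momentum_step(param, grad, slot, lr=0.01, momentum=0.9):
    # slot mirrors slots[param] in test_sgd_momentum
    slot = momentum * slot - lr * grad
    return param + slot, slot

def adam_step(param, grad, m, v, t, lr=0.01, beta0=0.8, beta1=0.9, eps=1e-4):
    # bias-corrected Adam update, matching check_value() in test_adam;
    # t is the 1-based step count
    m = beta0 * m + (1 - beta0) * grad
    v = beta1 * v + (1 - beta1) * grad * grad
    update = (m / (1 - beta0 ** t)) / (np.sqrt(v / (1 - beta1 ** t)) + eps)
    return param - lr * update, m, v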

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine has a GPU installed along with its driver. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
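As a quick sanity check that the unified package can actually see a GPU, something like the following should work. This is a minimal sketch assuming the installed version exposes megengine.is_cuda_available(); the exact helper may differ between releases.

import megengine as mge

# assumption: is_cuda_available() is exported at the top level in this release
if mge.is_cuda_available():
    print("CUDA device detected; GPU kernels will be used.")
else:
    print("No CUDA device found; running on CPU.")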