
optimizer.py

# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from abc import ABCMeta, abstractmethod
from collections.abc import Iterable
from typing import Dict
from typing import Iterable as Iter
from typing import Union

import numpy as np

from .._internal.config import opr_priority_scope
from ..core import Buffer, Parameter, Tensor, TensorDict
from ..core.graph import get_default_graph
from ..distributed import all_reduce_sum, bcast_param, get_world_size, is_distributed
from ..functional import add_update
from ..functional import grad as grad_func
from ..jit import sideeffect


class _RequiredParameter:
    def __repr__(self):
        return "<required parameter>"


required = _RequiredParameter()
class Optimizer(metaclass=ABCMeta):
    r"""Base class for all optimizers.

    :param params: specifies what Tensors should be optimized.
    :param defaults: a dict of default parameters of Optimizer, like learning rate or momentum.
    :param bcast_period: interval, in iterations, between two parameter broadcasts during
        distributed training. Default: 500
    """

    def __init__(  # pylint: disable=too-many-branches
        self,
        params: Union[Iter[Parameter], dict],
        defaults: dict,
        bcast_period: int = 500,
    ):
        self._state = TensorDict()
        self._defaults = defaults
        self._bcast_iter = 0
        self._bcast_period = bcast_period

        if isinstance(params, (Parameter, dict)):
            params = [params]
        else:
            if not isinstance(params, Iterable):
                raise TypeError(
                    "params argument given to the optimizer should be "
                    "Parameter or dict, or Iterable of them"
                )

        self.param_groups = []  # type: list

        param_groups = list(params)
        if len(param_groups) == 0:
            raise ValueError("optimizer got an empty parameter list")

        param_type = type(param_groups[0])
        for param in param_groups:
            if not isinstance(param, param_type):
                raise TypeError(
                    "types of params argument given to the optimizer should be the same"
                )

        if not isinstance(param_groups[0], dict):
            param_groups = [{"params": param_groups}]

        for group in param_groups:
            self.add_param_group(group)

        for group in self.param_groups:
            self._create_state(group)

        if is_distributed() and bcast_period != -1:
            self.bcast_param()
    def add_param_group(self, param_group: dict):
        r"""Add a param group to ``param_groups`` of the :class:`~megengine.optim.optimizer.Optimizer`.

        This can be useful when fine-tuning a pre-trained network, as frozen layers can be made
        trainable and added to the :class:`~megengine.optim.optimizer.Optimizer` as training progresses.

        :param param_group: specifies what tensors should be optimized along with group-specific options.
        """
        assert isinstance(param_group, dict), "param group must be a dict"

        if isinstance(param_group["params"], Parameter):
            param_group["params"] = [param_group["params"]]
        else:
            param_group["params"] = list(param_group["params"])

        for param in param_group["params"]:
            if not isinstance(param, Parameter):
                raise TypeError(
                    "optimizer can only optimize Parameters, but one of the params is "
                    + str(type(param))
                )
            if not param.requires_grad:
                raise ValueError(
                    "optimizer can only optimize Parameters with requires_grad=True"
                )

        for name, default in self._defaults.items():
            if default is required and name not in param_group:
                raise ValueError(
                    "parameter group didn't specify a value of "
                    "required optimization parameter " + name
                )
            param_group.setdefault(name, default)

        param_set = set()
        for group in self.param_groups:
            param_set.update(set(map(id, group["params"])))

        assert param_set.isdisjoint(
            set(map(id, param_group["params"]))
        ), "some parameters appear in more than one parameter group"

        self.param_groups.append(param_group)
    def _add_state(self, param, state_name, initializer=None):
        # Allocate a per-parameter state buffer (e.g. a momentum buffer),
        # zero-initialized by default.
        if initializer is None:
            initializer = np.zeros(param.shape, dtype=np.float32)
        state_dict = self._state.setdefault(param, {})
        assert state_name not in state_dict
        state = Buffer(value=initializer)
        state_dict[state_name] = state

    @abstractmethod
    def _create_state(self, param_group):
        pass

    @abstractmethod
    def _updates(self, param_group):
        pass

    def backward(self, loss: Tensor):
        """Computes the back-propagation of the network given the loss.

        :param loss: the obtained loss tensor
        """
        rst = []
        key = 0
        params = []
        for group in self.param_groups:
            for param in group["params"]:
                if param.grad is None:
                    param.grad = Buffer(
                        value=np.zeros(shape=param.shape, dtype=np.float32)
                    )
                params.append(param)
                assert hasattr(param, "grad"), "param has no grad"
                assert isinstance(param.grad, Buffer), "grad must be a buffer"

        cg = get_default_graph()
        grads = grad_func(loss, params, use_virtual_grad=not cg.is_eager())
        if not isinstance(grads, list):
            grads = [grads]
        assert len(grads) == len(params)

        for param, grad in zip(params, grads):
            if is_distributed():
                key += 1
                with opr_priority_scope(cg, -key):
                    # all_reduce_mean: average the gradient across all workers
                    grad = all_reduce_sum(grad, key) / get_world_size()
                # the priority scopes control how the communication and update
                # operators are scheduled in the graph
                with opr_priority_scope(cg, (1 << 30) - key):
                    grad_update = add_update(param.grad, grad)
            else:
                grad_update = add_update(param.grad, grad)
            rst.append(grad_update)

        return rst
    @sideeffect
    def step(self):
        r"""Performs a single optimization step."""
        for group in self.param_groups:
            if isinstance(group["params"], set):
                raise TypeError(
                    "optimized parameters need to be organized in ordered collections, "
                    "but the ordering of parameters in sets will change between runs. "
                    "Please use a list instead."
                )
            self._updates(group)
        if is_distributed() and self._bcast_period != -1:
            self._bcast_iter += 1
            if self._bcast_iter == self._bcast_period:
                self.bcast_param()
                self._bcast_iter = 0

    @sideeffect
    def zero_grad(self):
        r"""Resets all gradients to zero."""
        for param_group in self.param_groups:
            for param in param_group["params"]:
                if param.grad is not None:
                    param.grad.reset_zero()

    def bcast_param(self):
        # broadcast every parameter so all workers stay in sync in distributed training
        key = 0
        for group in self.param_groups:
            for param in group["params"]:
                bcast_param(param, key)
                key += 1
    def state_dict(self) -> Dict:
        r"""Export the optimizer state.

        :return: optimizer state. Can be loaded by :meth:`load_state_dict`.
        """
        param_groups = []
        state = dict()
        param2id = TensorDict()

        cur_id = 0
        for group in self.param_groups:
            for param in group["params"]:
                if param not in param2id:
                    param2id[param] = cur_id
                    cur_id += 1

        for param, st in self._state.items():
            state[param2id[param]] = st

        for group in self.param_groups:
            param_group = {k: v for k, v in group.items() if k != "params"}
            param_group["params"] = [param2id[param] for param in group["params"]]
            param_groups.append(param_group)

        return {"param_groups": param_groups, "state": state}

    def load_state_dict(self, state: dict):
        r"""Loads the optimizer state.

        :param state: optimizer state. Should be an object returned
            from a call to :meth:`state_dict`.
        """
        if len(self.param_groups) != len(state["param_groups"]):
            raise ValueError(
                "loaded state dict has a different number of parameter groups"
            )
        parameter_map = dict()  # type: Dict
        for group_new, group_saved in zip(self.param_groups, state["param_groups"]):
            if len(group_new["params"]) != len(group_saved["params"]):
                raise ValueError(
                    "loaded state dict contains a parameter group that "
                    "doesn't match the size of optimizer's group"
                )
            for param_new, param_saved in zip(
                group_new["params"], group_saved["params"]
            ):
                p = param_new
                self._state[p] = state["state"][param_saved].copy()
                for k, v in self._state[p].items():
                    if isinstance(v, Buffer) and v._comp_graph != p._comp_graph:
                        self._state[p][k] = Buffer(v.numpy())

            if set(group_new.keys()) != set(group_saved.keys()):
                raise ValueError(
                    "loaded state dict contains a parameter group that "
                    "doesn't match the keys of optimizer's group"
                )
            for key in group_new.keys():
                if key != "params":
                    group_new[key] = group_saved[key]

        if len(self._state.keys()) != len(state["state"].keys()):
            raise ValueError(
                "loaded state dict contains a state that doesn't match "
                "the size of optimizer's state"
            )
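
The base class above leaves two hooks to concrete optimizers: `_create_state`, which allocates any per-parameter state buffers, and `_updates`, which applies the update rule to one parameter group. The following is a minimal sketch of how a subclass could wire these together; it is not part of optimizer.py. `SimpleSGD`, `net`, `loss_fn`, and `loader` are hypothetical names, and the sketch assumes `add_update(dest, delta, alpha, beta)` performs the in-place update dest = alpha * dest + beta * delta (the two-argument form is what `backward` uses above to accumulate gradients).

# Minimal SGD-style subclass (illustrative sketch, assuming the add_update
# semantics described above).
class SimpleSGD(Optimizer):
    def __init__(self, params, lr):
        # "lr" has no default, so every param group must provide it.
        super().__init__(params, defaults=dict(lr=lr))

    def _create_state(self, param_group):
        # Plain SGD keeps no extra state; an optimizer with momentum would
        # call self._add_state(param, "momentum_buffer") here instead.
        pass

    def _updates(self, param_group):
        lr = param_group["lr"]
        for param in param_group["params"]:
            # In-place update: param = param - lr * param.grad (assumed signature).
            add_update(param, param.grad, alpha=1.0, beta=-lr)


# Typical training loop (net, loss_fn and loader are placeholders):
#     opt = SimpleSGD(net.parameters(), lr=0.01)
#     for data, label in loader:
#         opt.zero_grad()                    # clear accumulated gradients
#         loss = loss_fn(net(data), label)
#         opt.backward(loss)                 # compute and accumulate gradients
#         opt.step()                         # apply _updates to every param group

Checkpointing the optimizer then goes through `state_dict()` and `load_state_dict()`, which map parameters to integer ids exactly as defined in the methods above.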

The MegEngine installation package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU and GPU build to choose between. To run GPU programs, make sure the machine has a GPU device and its driver installed. If you would like to try deep-learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.