
grad_manager.py 15 kB

# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import weakref
from collections import OrderedDict
from typing import Callable, Iterable, List, Union

from ..core._imperative_rt.core2 import pop_scope, push_scope, set_option
from ..core.autodiff.grad import Grad
from ..logger import get_logger
from ..tensor import Tensor
from ..utils.future import Future

logger = get_logger(__name__)

backwarding_grad_manager = None


def get_backwarding_grad_manager():
    return backwarding_grad_manager


class AttachSpec:
    __slots__ = "tensor", "callbacks"


_global_priority = 0
class GradManager:
    r"""
    GradManager computes gradients or, more generally, vector-Jacobian products, by reverse
    mode automatic differentiation (a.k.a. back propagation).

    Reverse mode autodiff normally reuses many intermediate tensors for best computation
    efficiency. In a read-eval-print-loop (REPL) environment, however, it is impossible to
    know in advance which gradients the user will take later, and hence which tensors to
    keep. To solve this problem, the user must somehow declare beforehand which gradients
    could possibly be taken. With GradManager, users are required to call the :meth:`attach`
    method on a tensor if they want to take gradients with respect to it later. Furthermore,
    any computation on a tensor before it is attached is completely ignored from the
    autodiff perspective, so :meth:`attach` must be called before any computation that needs
    differentiation.

    For example, the following symbolic differentiation code

    .. code-block::

        x = get_x()
        y = f(x)
        dy = ones_like(y)
        dx = vjp(y, x, dy)  # vector-Jacobian product

    can be rewritten using GradManager for a REPL environment as

    .. code-block::

        with GradManager() as gm:
            x = get_x()
            gm.attach(x)  # must be placed before any computation on x that needs differentiation
            y = f(x)
            dy = ones_like(y)
            gm.backward(y, dy)  # doesn't need x, already known via attach()
            dx = x.grad  # backward() saves result to .grad attribute

    A more realistic example of training a neural network would be

    .. code-block::

        gm = GradManager()
        gm.attach(model.parameters())

        for data in dataset:
            with gm:
                loss = model(data)
                gm.backward(loss)
                # gradients w.r.t. parameters are accumulated into their .grad attributes

    You can also use the ``record()`` and ``release()`` methods instead of a ``with`` context:

    .. code-block::

        gm = GradManager()
        gm.attach(model.parameters())

        for data in dataset:
            gm.record()
            loss = model(data)
            gm.backward(loss)
            # backward() will clear recorded history and free resources
            # call release() if backward() is not called
            # gm.release()

    For your convenience, a GradManager may (but need not) be reused. As shown in the
    examples, you only need to attach a tensor once and GradManager will remember it
    afterwards. However, a single GradManager can record only one computation history at a
    time. To run multiple differentiations simultaneously or perform high order
    differentiation, create as many GradManagers as you need.

    .. note::

        Mutable tensors introduce ambiguities when doing symbolic differentiation: which
        version of the tensor are we referring to? For attached tensors, GradManager
        resolves this ambiguity by "snapshotting" them on first encounter, either on
        :meth:`record` (or entering the with statement) if the tensor is attached before
        :meth:`record`, or on :meth:`attach` if GradManager is already recording. Attached
        tensors will then be interpreted as their snapshotted version for differentiation
        purposes. The same ambiguity on the first parameter of :meth:`backward` is simply
        resolved by using the latest version.

    Typically, in data parallel training, we would like to average the gradients across
    processes. Users will finally get the averaged gradients if an "AllReduce" callback is
    registered as follows:

    .. code-block::

        import megengine.distributed as dist

        gm = GradManager()
        gm.attach(model.parameters(), callbacks=dist.make_allreduce_cb("MEAN"))
    """
    def __init__(self):
        self._attach_specs = {}  # id(Tensor) -> AttachSpec
        self._recording = False
        self._grad = None
        self._after_backward_callback = []
        self._gradients = {}
        self._priority = None

    def attached_tensors(self):
        r"""Return attached tensor list from :meth:`attach`."""
        return [spec.tensor() for spec in self._attach_specs.values()]
    def attach(self, tensors: Iterable[Tensor], callbacks=None):
        r"""
        Instruct GradManager to track operations on tensors, so that gradients with respect
        to those tensors can be evaluated later.

        :meth:`attach` also accepts a list of callbacks, which will be called with the
        tensor and its gradient during :meth:`backward`. The signature of a callback should
        look like:

        .. code-block::

            def callback(tensor: Tensor, grad: Tensor) -> Tensor:
                ...
                # returned grad is passed to subsequent callbacks
                # and finally accumulated to the .grad attribute of tensor
                return grad

        :meth:`attach` calls with overlapping tensors will result in their callbacks being
        concatenated, independently for each tensor. For example,

        .. code-block::

            gm.attach([x, y], callbacks=[f])
            gm.attach([y], callbacks=[g])

        is equivalent to

        .. code-block::

            gm.attach([x], callbacks=[f])
            gm.attach([y], callbacks=[f, g])

        The effect of :meth:`attach` will persist across multiple uses of the GradManager.
        When reusing a GradManager, it is likely a mistake to call :meth:`attach` on the
        same set of tensors and callbacks repeatedly, which may grow the callback list
        indefinitely.

        .. note::

            When reusing a GradManager, it is sometimes desirable to attach temporary
            tensors each time, e.g. for computing gradients of the inputs of a neural
            network. GradManager tries to accommodate such usages by holding weak references
            to attached tensors. Most of the time, this should be enough to prevent resource
            leaks. Unfortunately, there are still some pitfalls left:

            - Callbacks should not hold strong references, directly or indirectly, to
              attached tensors. Any strong reference, including those from callbacks, will
              prevent garbage collection (even by the cycle collector!) of an attached
              tensor, until the GradManager object is garbage collected.

            Please also note that GradManager might hold additional strong references to
            attached tensors when it is in use. This note only covers potential resource
            leaks across multiple uses of a GradManager, which is unrelated to whether
            resources are released in a timely manner within a single use.

        :param tensors: tensor or list of tensors to track
        :param callbacks: callback or list of callbacks
        """
        if callbacks is None:
            callbacks = []
        if isinstance(callbacks, Callable):
            callbacks = [callbacks]
        if isinstance(tensors, Tensor):
            tensors = [tensors]

        def make_spec(tensor):
            selfref = weakref.ref(self)
            key = id(tensor)

            def deleter(_):
                self = selfref()
                if self is not None:
                    del self._attach_specs[key]

            spec = AttachSpec()
            spec.tensor = weakref.ref(tensor, deleter)
            spec.callbacks = []
            return spec

        for x in tensors:
            assert isinstance(x, Tensor), "Object to be attached should be Tensor"
            spec = self._attach_specs.get(id(x))
            new_attach = spec is None
            if spec is None:
                spec = make_spec(x)
                self._attach_specs[id(x)] = spec
            spec.callbacks.extend(callbacks)
            if new_attach and self._recording:
                self._do_record(spec)
        return self

    def _register_after_backward_callback(self, callback):
        self._after_backward_callback.append(callback)
        return self
    def backward(
        self,
        y: Union[Tensor, List[Tensor]] = None,
        dy: Union[Tensor, List[Tensor]] = None,
    ):
        r"""
        Compute gradients (or vector-Jacobian products) for all attached tensors, accumulate
        them into the corresponding .grad attributes, and release resources along the way.

        :meth:`backward` computes the vector-Jacobian product
        :math:`dx_j = \sum_{i} dy_i J_{ij}`, where
        :math:`J_{ij} = \partial y_i / \partial x_j` is the Jacobian matrix between vector
        variables :math:`y` and :math:`x`, with all vectors involved represented as a list
        of tensors, in the sense of direct sums (or flatten-and-concatenate). :math:`y` and
        :math:`dy` are passed as the first and second parameter respectively, whereas
        :math:`x` is directly taken from the list of all attached tensors. The result
        :math:`dx` is not returned. Instead, it is directly accumulated into the .grad
        attribute of the matching attached tensors (a.k.a. :math:`x`). This can be done
        unambiguously since :math:`dx`, as a list of tensors, has the same structure as
        :math:`x`.

        If :math:`y` is a scalar and :math:`dy` is chosen to be 1, the vector-Jacobian
        product yields the gradient of :math:`y` with respect to :math:`x` as a special
        case. In that case, you can omit the :math:`dy` parameter and :meth:`backward` will
        automatically use 1 for it and compute the gradient.

        :meth:`backward` consumes all resources held by this GradManager and releases them
        in the process of this call. When the call successfully finishes, the GradManager
        will be put back into an inactive state.

        :param y: tensor or list of tensors
        :param dy: tensor or list of tensors. Defaults to 1 if y is scalar
        """
        push_scope("backward")
        set_option("record_computing_path", 0)
        from ..functional import ones_like

        global backwarding_grad_manager

        cache = backwarding_grad_manager
        backwarding_grad_manager = self
        if not self._recording:
            raise RuntimeError(
                "no computation history. "
                "did you forget record() or "
                "call a method that clears the history?"
            )
        assert self._grad is not None
        # These checks should be consistent with GradScaler's
        if y is None:
            ys = []
        elif isinstance(y, (tuple, list)):
            ys = y
        else:
            ys = [y]
        if dy is None:
            dys = [ones_like(y) for y in ys]
        elif isinstance(dy, (tuple, list)):
            dys = dy
        else:
            dys = [dy]
        try:
            self._grad(ys, dys)
            for callback in self._after_backward_callback:
                callback()
            for id_, grad in self._gradients.items():
                if isinstance(grad, Future):
                    grad = grad.get()
                spec = self._attach_specs.get(id_)
                tensor = spec and spec.tensor()
                if tensor is not None:
                    if tensor.grad is None:
                        tensor.grad = grad
                    else:
                        tensor.grad += grad
                    if tensor._isscalar() and tensor.grad is not None:
                        tensor.grad._setscalar()
        finally:
            self.release()
            backwarding_grad_manager = cache
            set_option("record_computing_path", 1)
            pop_scope("backward")
    def record(self):
        r"""
        Start recording operations.

        After this call, you will be able to call :meth:`backward`.
        """
        global _global_priority
        if self._recording:
            raise RuntimeError("already recording")
        grad = Grad()
        self._recording = True
        self._grad = grad
        for spec in self._attach_specs.values():
            self._do_record(spec)
        if self._priority is None:
            grad._priority = _global_priority
            _global_priority -= 1
        grad.__enter__()
    def _do_record(self, spec):
        tensor = spec.tensor()
        if tensor is None:
            return

        def callback(grad, callbacks=spec.callbacks):
            for cb in callbacks:
                grad = cb(tensor, grad)
            self._gradients[id(tensor)] = grad

        # NOTE: calling wrt() several times on the same tensor overrides the
        # previously registered callback
        self._grad.wrt(tensor, callback=callback)
    def release(self):
        r"""
        Stop recording operations and release resources kept for gradient computation.

        After this call, you will not be able to call :meth:`backward`.
        """
        global _global_priority
        if self._grad is not None:
            self._grad.__exit__(None, None, None)
            self._grad = None
        self._recording = False
        self._gradients = dict()
        if self._priority is None:
            _global_priority += 1
    def __enter__(self):
        self.record()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.release()

    def __or__(self, other):
        if isinstance(other, GradManager):
            return GradManagerGroup([self, other])
        return NotImplemented

    __ror__ = __or__
class GradManagerGroup:
    def __init__(self, gms) -> None:
        self._gms = list(gms)

    def merge_with(self, other):
        if isinstance(other, GradManager):
            other = GradManagerGroup([other])
        elif not isinstance(other, GradManagerGroup):
            return NotImplemented
        return GradManagerGroup([*self._gms, *other._gms])

    __or__ = merge_with
    __ror__ = merge_with

    def __enter__(self):
        global _global_priority
        _global_priority += 1
        for gm in self._gms:
            gm._priority = _global_priority
            gm.record()

    def __exit__(self, exc_type, exc_val, exc_tb):
        global _global_priority
        _global_priority -= 1
        for gm in self._gms:
            gm.release()
            gm._priority = None
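
For reference, here is a minimal usage sketch of the callback mechanism described in the attach() docstring. It is not part of grad_manager.py: the callback name scale_grad and the constant 0.5 are purely illustrative, and it assumes MegEngine is installed with this class exposed as megengine.autodiff.GradManager.

import megengine as mge
from megengine.autodiff import GradManager

def scale_grad(tensor, grad):
    # Callback signature from attach(): receives the attached tensor and its
    # gradient; the returned gradient is passed to later callbacks and finally
    # accumulated into tensor.grad.
    return grad * 0.5

w = mge.Parameter([1.0, 2.0, 3.0])
gm = GradManager().attach([w], callbacks=[scale_grad])

with gm:                   # record() on enter; backward() releases the history
    y = (w * w).sum()
    gm.backward(y)         # dy is omitted, so ones_like(y) is used

print(w.grad)              # 0.5 * d(sum(w*w))/dw = 0.5 * 2w = [1.0, 2.0, 3.0]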
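
The __or__/__ror__ overloads and GradManagerGroup above suggest that two managers can be combined so both record inside a single with block. The sketch below is an assumption about that usage rather than documented API; the names gm1, gm2, x1, x2 are illustrative, and the megengine.autodiff.GradManager import path is assumed as before.

import megengine as mge
from megengine.autodiff import GradManager

x1 = mge.Parameter([1.0])
x2 = mge.Parameter([2.0])
gm1 = GradManager().attach([x1])
gm2 = GradManager().attach([x2])

# gm1 | gm2 builds a GradManagerGroup; entering it calls record() on every
# member with a shared priority, and leaving it calls release() on each.
with gm1 | gm2:
    y1 = x1 * 3.0
    y2 = x2 * 5.0
    gm1.backward(y1)       # backward() also releases gm1's own recording
    gm2.backward(y2)

print(x1.grad)             # d(3 * x1)/dx1 = 3
print(x2.grad)             # d(5 * x2)/dx2 = 5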

The MegEngine installation package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has a GPU device and the corresponding driver installed. If you would like to try deep learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.