
functional.py 8.3 kB

# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from typing import Optional, Tuple

from ..core._imperative_rt.core2 import apply
from ..core.autodiff.grad import _grad_manager_dict
from ..core.ops.builtin import CollectiveComm, Copy, PyOpBase, RemoteRecv, RemoteSend
from ..device import get_default_device
from ..tensor import Tensor
from .group import WORLD, Group, get_backend, get_client, get_mm_server_addr, get_rank

__all__ = [
    "reduce_sum",
    "broadcast",
    "all_gather",
    "reduce_scatter_sum",
    "all_reduce_sum",
    "all_reduce_max",
    "all_reduce_min",
    "gather",
    "scatter",
    "all_to_all",
    "remote_send",
    "remote_recv",
]

def collective_comm(inp, mode, group, device):
    """Helper function for applying collective communication functions."""
    assert isinstance(group, Group)
    if group is None:
        return inp
    addr, port = get_mm_server_addr()
    op = CollectiveComm(
        key=group.key,
        nr_devices=group.size,
        rank=group.rank,
        is_root=(group.rank == 0),
        local_grad=False,
        addr=addr,
        port=port,
        mode=mode,
        dtype=inp.dtype,
        backend=get_backend(),
        comp_node=device,
    )
    return apply(op, inp)[0]

def reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create reduce_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.REDUCE_SUM
    return collective_comm(inp, mode, group, device)


def broadcast(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create broadcast operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.BROADCAST
    return collective_comm(inp, mode, group, device)


def all_gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_gather operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_GATHER
    return collective_comm(inp, mode, group, device)


def reduce_scatter_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create reduce_scatter_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.REDUCE_SCATTER_SUM
    return collective_comm(inp, mode, group, device)


def all_reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_reduce_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_REDUCE_SUM
    return collective_comm(inp, mode, group, device)


def all_reduce_max(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_reduce_max operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_REDUCE_MAX
    return collective_comm(inp, mode, group, device)


def all_reduce_min(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_reduce_min operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_REDUCE_MIN
    return collective_comm(inp, mode, group, device)


def gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create gather operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.GATHER
    return collective_comm(inp, mode, group, device)


def scatter(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create scatter operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.SCATTER
    return collective_comm(inp, mode, group, device)


def all_to_all(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_to_all operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_TO_ALL
    return collective_comm(inp, mode, group, device)

class _RemoteSend(PyOpBase):
    """Wraps a ``RemoteSend`` op so autodiff can pair the send with a matching
    receive of the gradient from the destination rank."""

    def __init__(self, op: RemoteSend):
        self.op = op

    def _default_rule(self, data):
        return apply(self.op, data)

    def _grad_rule(self, data):
        # Record the sent tensor's metadata so backward() can receive a
        # gradient with the same shape and dtype on the same device.
        self.dtype = data.dtype
        self.shape = data.shape
        self.device = data.device
        (self.dummy,) = self._default_rule(data)
        return self.dummy, self.backward

    def backward(self, grad):
        assert grad is None
        if get_client().check_is_grad(self.op.key):
            return remote_recv(
                self.op.rank_to,
                self.shape,
                self.dtype,
                device=str(self.device),
                inp=self.dummy,
            )


class _RemoteRecv(PyOpBase):
    """Wraps a ``RemoteRecv`` op; its backward sends the gradient back to the
    source rank."""

    def __init__(self, op: RemoteRecv):
        self.op = op

    def _default_rule(self, dummy):
        return apply(self.op, dummy)

    def _grad_rule(self, dummy):
        return self._default_rule(dummy), self.backward

    def backward(self, grad):
        get_client().set_is_grad(self.op.key, grad is not None)
        if grad is not None:
            remote_send(grad, self.op.rank_from)

def remote_send(inp: Tensor, dest_rank: int) -> Tensor:
    """
    Send a Tensor to a remote process.

    :param inp: tensor to send.
    :param dest_rank: destination process rank.
    """
    key = "{}->{}".format(get_rank(), dest_rank)
    grad_keys = {}
    for n, g in _grad_manager_dict.items():
        if g._is_attached_to(inp):
            grad_keys[n] = g
    get_client().set_remote_tracer(key, grad_keys)

    op = RemoteSend()
    op.key = key
    op.addr, op.port = get_mm_server_addr()
    op.rank_to = dest_rank
    (dummy,) = apply(_RemoteSend(op), inp)

    for g in grad_keys.values():
        g._refkeeper.append(dummy)


def remote_recv(
    src_rank: int,
    shape: Tuple[int],
    dtype: type,
    device: Optional[str] = None,
    inp=None,
) -> Tensor:
    """
    Receive a Tensor from a remote process.

    :param src_rank: source process rank.
    :param shape: the shape of the tensor to receive.
    :param dtype: the data type of the tensor to receive.
    :param device: the device on which to place the received tensor.
    :param inp: dummy input used to determine the received tensor's type.
    """
    key = "{}->{}".format(src_rank, get_rank())

    if device is None:
        device = get_default_device()
    # dummy input
    if inp is None:
        inp = Tensor([0], device=device)
    tracer_set = get_client().check_remote_tracer(key)
    for n in tracer_set:
        g = _grad_manager_dict.get(n)
        if g is not None:
            g.wrt(inp)
            g._refkeeper.append(inp)

    op = RemoteRecv()
    op.key = key
    op.cn = device
    op.shape = shape
    op.dtype = dtype
    op.addr, op.port = get_mm_server_addr()
    op.rank_from = src_rank

    (ret,) = apply(_RemoteRecv(op), inp)
    return ret
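The functions above are normally reached through megengine.distributed, of which this file is the functional submodule. Below is a minimal usage sketch, not part of functional.py, assuming a machine with at least two GPUs and a recent MegEngine release where megengine.distributed exposes launcher, get_rank, and this functional module; adjust module paths to your version.

# usage_sketch.py -- hypothetical example, not part of functional.py.
# Assumes >= 2 GPUs and that dist.launcher spawns one worker per visible GPU.
import numpy as np

import megengine as mge
import megengine.distributed as dist


@dist.launcher
def worker():
    rank = dist.get_rank()

    # Collective: every rank contributes its own value, every rank gets the sum.
    x = mge.Tensor([float(rank)])
    total = dist.functional.all_reduce_sum(x)
    print("rank", rank, "all_reduce_sum ->", total.numpy())

    # Point-to-point: rank 0 sends a tensor, rank 1 receives it.
    if rank == 0:
        dist.functional.remote_send(mge.Tensor([1.0, 2.0, 3.0]), dest_rank=1)
    elif rank == 1:
        y = dist.functional.remote_recv(src_rank=0, shape=(3,), dtype=np.float32)
        print("rank 1 received", y.numpy())


if __name__ == "__main__":
    worker()

Note that remote_recv must be told the shape and dtype of the incoming tensor, since only the sending side knows them; the collective functions infer everything from the input tensor and the communication group.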

The MegEngine package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU and GPU build to choose between. To run GPU programs, make sure the machine has a GPU and that the driver is properly installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
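As a quick sanity check before launching GPU work, something like the following can confirm that MegEngine sees a GPU. This is a sketch assuming is_cuda_available and device.get_device_count are exposed as in recent MegEngine releases; check your version's documentation if they are missing.

# gpu_check.py -- hypothetical sanity check before running GPU programs.
import megengine as mge

if mge.is_cuda_available():
    # "gpu" counts the CUDA devices visible to MegEngine.
    print("GPUs detected:", mge.device.get_device_count("gpu"))
else:
    print("No GPU detected; code will fall back to CPU execution.")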