
distributed.py 8.3 kB

# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from typing import Optional, Tuple

from ..core._imperative_rt.ops import CollectiveCommMode
from ..core.autodiff.builtin_op_utils import builtin_op_get_backward_fn
from ..core.autodiff.grad import (
    Tracer,
    check_backward_allow_noinput,
    get_grad_managers,
    get_op_has_grad_fn,
    tracer_apply,
)
from ..core.ops.builtin import CollectiveComm, Copy, RemoteRecv, RemoteSend
from ..core.tensor.core import apply
from ..core.tensor.tensor import Tensor, tensor_apply
from ..distributed.group import (
    WORLD,
    Group,
    get_backend,
    get_client,
    get_mm_server_addr,
    get_rank,
)
from ..tensor import tensor

__all__ = [
    "reduce_sum",
    "broadcast",
    "all_gather",
    "reduce_scatter_sum",
    "all_reduce_sum",
    "all_reduce_max",
    "all_reduce_min",
    "gather",
    "scatter",
    "all_to_all",
    "remote_send",
    "remote_recv",
]

@apply.register()
def _(op: RemoteSend, *args: Tensor):
    ret = tensor_apply(op, *args)

    # set extra information
    tracer_set = dict()
    for k in set().union(*(i._extra_data for i in args if isinstance(i, Tensor))):
        tracer_set[k.name] = True

    # check tracer_set in remote_recv
    get_client().set_remote_tracer(op.key, tracer_set)
    return ret


@builtin_op_get_backward_fn.register(RemoteSend)
def _(op: RemoteSend, inputs, outputs, input_requires_grad):
    def backward(*args):
        return [
            remote_recv(
                op.rank_to, inputs[0].shape, inputs[0].dtype, str(inputs[0].device)
            )
        ]

    return backward, [True]


@get_op_has_grad_fn.register(RemoteSend)
def _(op: RemoteSend):
    def has_grad(opnode, reached):
        return get_client().check_is_grad(op.key)

    return has_grad


@check_backward_allow_noinput.register(RemoteSend)
def _(op: RemoteSend):
    return True


@builtin_op_get_backward_fn.register(RemoteRecv)
def _(op: RemoteRecv, inputs, outputs, input_requires_grad):
    def backward(*output_grads):
        return [remote_send(output_grads[0], op.rank_from)]

    return backward, [True]


@get_op_has_grad_fn.register(RemoteRecv)
def _(op: RemoteRecv):
    def has_grad(opnode, reached):
        ret = False
        for v in opnode.outputs:
            if v() in reached:
                ret = True
                break
        get_client().set_is_grad(op.key, ret)
        return ret

    return has_grad
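
# The registrations above tie RemoteSend / RemoteRecv into autodiff: the
# backward of a send is a receive from the peer rank (and vice versa), so
# gradients travel back along the same channel. The key/value client lets
# both processes agree on whether a gradient is actually needed
# (set_is_grad / check_is_grad) and which grad managers are tracing the
# sent tensor (set_remote_tracer / check_remote_tracer).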

def collective_comm(inp, mode, group, device):
    """Helper function for applying collective communication functions."""
    if group is None:
        # no group means nothing to communicate with; return the input as-is
        # (checked before the isinstance assert, otherwise None would trip it)
        return inp
    assert isinstance(group, Group)
    op = CollectiveComm()
    op.key = group.key
    op.nr_devices = group.size
    op.rank = group.rank
    op.is_root = op.rank == 0
    op.local_grad = False
    op.addr, op.port = get_mm_server_addr()
    op.mode = mode
    op.dtype = inp.dtype
    op.backend = get_backend()
    op.comp_node = device
    return apply(op, inp)[0]
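
# Each public wrapper below just picks a CollectiveCommMode and forwards to
# collective_comm on the default WORLD group. A minimal (hypothetical) call,
# assuming the process group has already been initialized on every worker:
#
#     out = all_reduce_sum(inp)        # every rank receives the summed tensor
#     part = reduce_scatter_sum(inp)   # each rank receives one shard of the sum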

def reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create reduce_sum operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.REDUCE_SUM
    return collective_comm(inp, mode, group, device)


def broadcast(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create broadcast operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.BROADCAST
    return collective_comm(inp, mode, group, device)


def all_gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_gather operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.ALL_GATHER
    return collective_comm(inp, mode, group, device)


def reduce_scatter_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create reduce_scatter_sum operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.REDUCE_SCATTER_SUM
    return collective_comm(inp, mode, group, device)


def all_reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_reduce_sum operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.ALL_REDUCE_SUM
    return collective_comm(inp, mode, group, device)
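
# Worked example (hypothetical, 2 ranks): if rank 0 holds [1., 2.] and rank 1
# holds [3., 4.], all_reduce_sum returns [4., 6.] on both ranks, while
# reduce_sum delivers [4., 6.] only to the root rank (rank 0).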

def all_reduce_max(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_reduce_max operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.ALL_REDUCE_MAX
    return collective_comm(inp, mode, group, device)


def all_reduce_min(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_reduce_min operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.ALL_REDUCE_MIN
    return collective_comm(inp, mode, group, device)


def gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create gather operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.GATHER
    return collective_comm(inp, mode, group, device)


def scatter(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create scatter operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.SCATTER
    return collective_comm(inp, mode, group, device)


def all_to_all(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_to_all operator for collective communication.

    :param inp: input tensor
    :param group: communication group
    :param device: execution device
    """
    mode = CollectiveCommMode.ALL_TO_ALL
    return collective_comm(inp, mode, group, device)

def remote_send(inp: Tensor, dest_rank: int) -> Tensor:
    """Send a Tensor to a remote process.

    :param inp: tensor to send
    :param dest_rank: destination process rank
    """
    op = RemoteSend()
    op.key = "{}->{}".format(get_rank(), dest_rank)
    op.addr, op.port = get_mm_server_addr()
    op.rank_to = dest_rank
    return apply(op, inp)[0]
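
# Note: remote_send must be paired with a matching remote_recv on the peer
# process; both sides rendezvous on the "{src_rank}->{dst_rank}" key built
# here and in remote_recv below.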

def remote_recv(
    src_rank: int, shape: Tuple[int], dtype: type, cn: Optional[str] = "gpu0"
) -> Tensor:
    """Receive a Tensor from a remote process.

    :param src_rank: source process rank
    :param shape: the shape of the tensor to receive
    :param dtype: the data type of the tensor to receive
    :param cn: the comp node to place the received tensor
    """
    key = "{}->{}".format(src_rank, get_rank())

    # dummy input
    inp = tensor([0])
    tracer_set = get_client().check_remote_tracer(key)
    for grad_manager in get_grad_managers():
        if grad_manager.name in tracer_set:
            grad_manager.wrt(inp)

    op = RemoteRecv()
    op.key = key
    op.cn = cn
    op.shape = shape
    op.dtype = dtype
    op.addr, op.port = get_mm_server_addr()
    op.rank_from = src_rank
    return apply(op, inp)[0]
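
A minimal usage sketch for the primitives above, assuming two worker processes that have already joined the default WORLD group (the group-setup call itself lives in ..distributed.group and is not shown); the import path and the "gpu1" comp node string are illustrative assumptions, not taken from this file:

import numpy as np
from megengine import tensor
from megengine.functional.distributed import all_reduce_sum, remote_recv, remote_send

def worker(rank):
    # process-group initialization for this rank is assumed to happen here
    if rank == 0:
        x = tensor(np.arange(4, dtype="float32"))
        remote_send(x, 1)                                # ship x to rank 1
    else:
        y = remote_recv(0, (4,), np.float32, cn="gpu1")  # receive rank 0's data
    # collective: with 2 ranks, every rank ends up holding [2., 2.]
    z = all_reduce_sum(tensor(np.ones((2,), dtype="float32")))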
