
functional.py 8.4 kB

# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from typing import Optional, Tuple

from ..core._imperative_rt.ops import CollectiveCommMode
from ..core.autodiff.builtin_op_utils import builtin_op_get_backward_fn
from ..core.autodiff.grad import (
    Tracer,
    check_backward_allow_noinput,
    get_grad_managers,
    get_op_has_grad_fn,
    tracer_apply,
)
from ..core.ops.builtin import CollectiveComm, Copy, RemoteRecv, RemoteSend
from ..core.tensor.core import apply
from ..core.tensor.tensor import Tensor, tensor_apply
from ..device import get_default_device
from ..tensor import tensor
from .group import WORLD, Group, get_backend, get_client, get_mm_server_addr, get_rank

__all__ = [
    "reduce_sum",
    "broadcast",
    "all_gather",
    "reduce_scatter_sum",
    "all_reduce_sum",
    "all_reduce_max",
    "all_reduce_min",
    "gather",
    "scatter",
    "all_to_all",
    "remote_send",
    "remote_recv",
]

@apply.register()
def _(op: RemoteSend, *args: Tensor):
    ret = tensor_apply(op, *args)

    # set extra information
    tracer_set = dict()
    for k in set().union(*(i._extra_data for i in args if isinstance(i, Tensor))):
        tracer_set[k.name] = True

    # check tracer_set in remote_recv
    get_client().set_remote_tracer(op.key, tracer_set)
    return ret


@builtin_op_get_backward_fn.register(RemoteSend)
def _(op: RemoteSend, inputs, outputs, input_requires_grad):
    # the gradient of a send is a receive of the peer's gradient tensor
    def backward(*args):
        return [
            remote_recv(
                op.rank_to, inputs[0].shape, inputs[0].dtype, str(inputs[0].device)
            )
        ]

    return backward, [True]


@get_op_has_grad_fn.register(RemoteSend)
def _(op: RemoteSend):
    # ask the peer process whether the sent tensor needs a gradient
    def has_grad(opnode, reached):
        return get_client().check_is_grad(op.key)

    return has_grad


@check_backward_allow_noinput.register(RemoteSend)
def _(op: RemoteSend):
    return True


@builtin_op_get_backward_fn.register(RemoteRecv)
def _(op: RemoteRecv, inputs, outputs, input_requires_grad):
    # the gradient of a receive is a send back to the source rank
    def backward(*output_grads):
        return [remote_send(output_grads[0], op.rank_from)]

    return backward, [True]


@get_op_has_grad_fn.register(RemoteRecv)
def _(op: RemoteRecv):
    # record on the server whether any output of this receive reaches a grad target
    def has_grad(opnode, reached):
        ret = False
        for v in opnode.outputs:
            if v() in reached:
                ret = True
                break
        get_client().set_is_grad(op.key, ret)
        return ret

    return has_grad

def collective_comm(inp, mode, group, device):
    """Helper function for applying collective communication functions."""
    if group is None:
        return inp
    assert isinstance(group, Group)
    op = CollectiveComm()
    op.key = group.key
    op.nr_devices = group.size
    op.rank = group.rank
    op.is_root = op.rank == 0
    op.local_grad = False
    op.addr, op.port = get_mm_server_addr()
    op.mode = mode
    op.dtype = inp.dtype
    op.backend = get_backend()
    op.comp_node = device
    return apply(op, inp)[0]

def reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create reduce_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.REDUCE_SUM
    return collective_comm(inp, mode, group, device)


def broadcast(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create broadcast operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.BROADCAST
    return collective_comm(inp, mode, group, device)


def all_gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_gather operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.ALL_GATHER
    return collective_comm(inp, mode, group, device)


def reduce_scatter_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create reduce_scatter_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.REDUCE_SCATTER_SUM
    return collective_comm(inp, mode, group, device)


def all_reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_reduce_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.ALL_REDUCE_SUM
    return collective_comm(inp, mode, group, device)


def all_reduce_max(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_reduce_max operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.ALL_REDUCE_MAX
    return collective_comm(inp, mode, group, device)


def all_reduce_min(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_reduce_min operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.ALL_REDUCE_MIN
    return collective_comm(inp, mode, group, device)


def gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create gather operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.GATHER
    return collective_comm(inp, mode, group, device)


def scatter(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create scatter operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.SCATTER
    return collective_comm(inp, mode, group, device)


def all_to_all(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """Create all_to_all operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveCommMode.ALL_TO_ALL
    return collective_comm(inp, mode, group, device)

def remote_send(inp: Tensor, dest_rank: int) -> Tensor:
    """Send a Tensor to a remote process.

    :param inp: tensor to send.
    :param dest_rank: destination process rank.
    """
    op = RemoteSend()
    op.key = "{}->{}".format(get_rank(), dest_rank)
    op.addr, op.port = get_mm_server_addr()
    op.rank_to = dest_rank
    return apply(op, inp)[0]


def remote_recv(
    src_rank: int, shape: Tuple[int], dtype: type, device: Optional[str] = None
) -> Tensor:
    """Receive a Tensor from a remote process.

    :param src_rank: source process rank.
    :param shape: the shape of the tensor to receive.
    :param dtype: the data type of the tensor to receive.
    :param device: the device to place the received tensor.
    """
    key = "{}->{}".format(src_rank, get_rank())
    if device is None:
        device = get_default_device()

    # dummy input
    inp = tensor([0])
    tracer_set = get_client().check_remote_tracer(key)
    for grad_manager in get_grad_managers():
        if grad_manager.name in tracer_set:
            grad_manager.wrt(inp)

    op = RemoteRecv()
    op.key = key
    op.cn = device
    op.shape = shape
    op.dtype = dtype
    op.addr, op.port = get_mm_server_addr()
    op.rank_from = src_rank
    return apply(op, inp)[0]
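
The exported functions are normally driven from a multi-process launcher. The sketch below is a minimal, hedged illustration of that pattern and is not part of the module above: it assumes these functions are importable as megengine.distributed.*, that dist.init_process_group accepts (master_ip, port, world_size, rank, device) positionally, and that each rank owns one GPU; exact signatures can differ between MegEngine versions.

# Hypothetical two-process demo of all_reduce_sum and remote_send/remote_recv.
import multiprocessing as mp

import numpy as np
import megengine as mge
import megengine.distributed as dist


def worker(rank, world_size, port):
    # Assumed argument order: master_ip, port, world_size, rank, device (GPU index).
    dist.init_process_group("localhost", port, world_size, rank, rank)

    # Collective: every rank contributes [rank + 1]; each rank receives the sum [3.0].
    x = mge.tensor([float(rank + 1)])
    print("rank", rank, "all_reduce_sum ->", dist.all_reduce_sum(x).numpy())

    # Point-to-point: rank 0 sends its tensor, rank 1 receives it.
    if rank == 0:
        dist.remote_send(x, 1)
    else:
        y = dist.remote_recv(0, (1,), np.float32)
        print("rank", rank, "remote_recv ->", y.numpy())


if __name__ == "__main__":
    world_size, port = 2, 23456
    procs = [mp.Process(target=worker, args=(r, world_size, port)) for r in range(world_size)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

The point-to-point pair mirrors the backward rules registered above: the gradient of remote_send is a remote_recv from the peer, and vice versa, so autodiff works across process boundaries.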

The MegEngine package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine has a GPU and that its driver is installed. If you would like to try deep learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
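
As a small illustrative sketch (assuming the public helpers megengine.is_cuda_available and megengine.set_default_device behave as their names suggest), a script can probe for a usable GPU at runtime and fall back to the CPU, since the same package serves both cases:

import megengine as mge

# is_cuda_available() is expected to be True only when a GPU and its driver are usable.
if mge.is_cuda_available():
    mge.set_default_device("gpu0")  # place new tensors on the first GPU
else:
    mge.set_default_device("cpu0")  # otherwise stay on the CPU

x = mge.tensor([1.0, 2.0, 3.0])
print(x.device, x.sum().numpy())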