
functional.py 7.1 kB

# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from typing import Optional, Tuple

from ..core._imperative_rt.core2 import apply
from ..core.autodiff.builtin_op_utils import builtin_op_get_backward_fn
from ..core.autodiff.grad import (
    Tracer,
    check_backward_allow_noinput,
    get_grad_managers,
    get_op_has_grad_fn,
    tracer_apply,
)
from ..core.ops.builtin import CollectiveComm, Copy, RemoteRecv, RemoteSend
from ..core.tensor.tensor import Tensor, tensor_apply
from ..device import get_default_device
from ..tensor import tensor
from .group import WORLD, Group, get_backend, get_client, get_mm_server_addr, get_rank

__all__ = [
    "reduce_sum",
    "broadcast",
    "all_gather",
    "reduce_scatter_sum",
    "all_reduce_sum",
    "all_reduce_max",
    "all_reduce_min",
    "gather",
    "scatter",
    "all_to_all",
    "remote_send",
    "remote_recv",
]


def collective_comm(inp, mode, group, device):
    """Helper function for applying collective communication functions."""
    # Return the input unchanged when no group is given; only validate the
    # group after the None check so the early return is actually reachable.
    if group is None:
        return inp
    assert isinstance(group, Group)
    addr, port = get_mm_server_addr()
    op = CollectiveComm(
        key=group.key,
        nr_devices=group.size,
        rank=group.rank,
        is_root=(group.rank == 0),
        local_grad=False,
        addr=addr,
        port=port,
        mode=mode,
        dtype=inp.dtype,
        backend=get_backend(),
        comp_node=device,
    )
    return apply(op, inp)[0]


def reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create reduce_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.REDUCE_SUM
    return collective_comm(inp, mode, group, device)


def broadcast(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create broadcast operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.BROADCAST
    return collective_comm(inp, mode, group, device)


def all_gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_gather operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_GATHER
    return collective_comm(inp, mode, group, device)


def reduce_scatter_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create reduce_scatter_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.REDUCE_SCATTER_SUM
    return collective_comm(inp, mode, group, device)


def all_reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_reduce_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_REDUCE_SUM
    return collective_comm(inp, mode, group, device)


def all_reduce_max(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_reduce_max operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_REDUCE_MAX
    return collective_comm(inp, mode, group, device)


def all_reduce_min(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_reduce_min operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_REDUCE_MIN
    return collective_comm(inp, mode, group, device)


def gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create gather operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.GATHER
    return collective_comm(inp, mode, group, device)


def scatter(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create scatter operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.SCATTER
    return collective_comm(inp, mode, group, device)


def all_to_all(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_to_all operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_TO_ALL
    return collective_comm(inp, mode, group, device)


def remote_send(inp: Tensor, dest_rank: int) -> Tensor:
    """
    Send a Tensor to a remote process.

    :param inp: tensor to send.
    :param dest_rank: destination process rank.
    """
    op = RemoteSend()
    op.key = "{}->{}".format(get_rank(), dest_rank)
    op.addr, op.port = get_mm_server_addr()
    op.rank_to = dest_rank
    return apply(op, inp)[0]


def remote_recv(
    src_rank: int,
    shape: Tuple[int],
    dtype: type,
    device: Optional[str] = None,
    inp=None,
) -> Tensor:
    """
    Receive a Tensor from a remote process.

    :param src_rank: source process rank.
    :param shape: the shape of the tensor to receive.
    :param dtype: the data type of the tensor to receive.
    :param device: the device to place the received tensor.
    :param inp: dummy input to determine the type of the received tensor.
    """
    key = "{}->{}".format(src_rank, get_rank())
    if device is None:
        device = get_default_device()

    # dummy input
    if inp is None:
        inp = tensor([0], device=device)

    # Register the dummy input with any gradient manager that traces this key.
    tracer_set = get_client().check_remote_tracer(key)
    for grad_manager in get_grad_managers():
        if grad_manager.name in tracer_set:
            grad_manager.wrt(inp)

    op = RemoteRecv()
    op.key = key
    op.cn = device
    op.shape = shape
    op.dtype = dtype
    op.addr, op.port = get_mm_server_addr()
    op.rank_from = src_rank
    return apply(op, inp)[0]
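
The wrappers above only construct the corresponding CollectiveComm, RemoteSend, or RemoteRecv op and apply it to the input tensor; process-group setup and the rendezvous server are handled elsewhere in megengine.distributed. The sketch below shows how they might be called from two worker processes. It is illustrative only: the @dist.launcher decorator used to spawn one process per GPU and the two-GPU assumption are not part of this file, and the exact launch API may differ between MegEngine releases.

# Illustrative sketch (not part of functional.py); assumes a machine with at
# least two GPUs and a launcher-style helper in megengine.distributed.
import numpy as np

import megengine as mge
import megengine.distributed as dist


@dist.launcher  # assumed helper: spawns one worker process per visible GPU
def worker():
    rank = dist.get_rank()

    # Collective op: every rank contributes a tensor, every rank receives the sum.
    x = mge.tensor(np.ones((2, 2), dtype="float32") * (rank + 1))
    y = dist.functional.all_reduce_sum(x)
    print(rank, y.numpy())

    # Point-to-point: rank 0 sends its tensor, rank 1 receives it by shape/dtype.
    if rank == 0:
        dist.functional.remote_send(x, dest_rank=1)
    elif rank == 1:
        recv = dist.functional.remote_recv(src_rank=0, shape=(2, 2), dtype=np.float32)
        print(recv.numpy())


worker()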

The MegEngine package ships with the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine has a GPU and that its driver is installed. If you would like to try deep learning development on a cloud GPU platform, visit the MegStudio platform.
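
Whether the current installation can actually see a GPU can be checked at runtime before picking a device. The snippet below is a small sketch; megengine.is_cuda_available() and the "gpu0"/"cpu0" device strings are assumed to behave as in recent MegEngine releases.

import megengine as mge

# Fall back to CPU execution when no GPU (or driver) is available.
# is_cuda_available() and the "gpu0"/"cpu0" naming are assumptions about the
# installed MegEngine version; adjust if your release differs.
device = "gpu0" if mge.is_cuda_available() else "cpu0"

x = mge.tensor([1.0, 2.0, 3.0], device=device)
print(x.device)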