
test_functional_distributed.py

# -*- coding: utf-8 -*-
import platform

import numpy as np
import pytest

import megengine as mge
import megengine.distributed as dist
from megengine import Parameter, tensor
from megengine.core._imperative_rt.core2 import sync
from megengine.device import get_default_device, set_default_device
from megengine.functional.distributed import (
    all_gather,
    all_reduce_max,
    all_reduce_min,
    all_reduce_sum,
    all_to_all,
    broadcast,
    gather,
    reduce_scatter_sum,
    reduce_sum,
    remote_recv,
    remote_send,
    scatter,
)

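# Each helper below follows the same pattern: dist.launcher(n_gpus=2) spawns one
# worker process per GPU, every worker runs the collective on its rank's entry of
# `data`, and the result is checked against `expect[rank]`.
# reduce_sum (as exercised here): sums the tensors from all ranks; only rank 0
# receives the result, the other ranks get None.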
def run_reduce_sum(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = reduce_sum(inp)
        if rank == 0:
            assert np.allclose(output.numpy(), expect[rank])
        else:
            assert output is None

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = x + y
    data = (x, y)
    expect = (z, None)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_reduce_sum_multishape(shape):
    run_reduce_sum(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_reduce_sum_multidtype(dtype):
    run_reduce_sum((8, 10), dtype)

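# broadcast (as exercised here): rank 0's tensor is copied to every rank, so all
# workers expect rank 0's data `x`.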
def run_broadcast(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = broadcast(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = x + 1
    data = (x, y)
    expect = (x, x)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_broadcast_multishape(shape):
    run_broadcast(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_broadcast_multidtype(dtype):
    run_broadcast((8, 10), dtype)

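# all_gather (as exercised here): the per-rank tensors are concatenated along the
# first dimension and every rank receives the full result.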
def run_all_gather(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_gather(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = np.concatenate((x, y))
    data = (x, y)
    expect = (z, z)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_gather_multishape(shape):
    run_all_gather(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_gather_multidtype(dtype):
    run_all_gather((8, 10), dtype)

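# reduce_scatter_sum (as exercised here): the per-rank tensors are summed, then the
# result is split along the first dimension and each rank keeps its own slice.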
def run_reduce_scatter_sum(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = reduce_scatter_sum(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = x + y
    data = (x, y)
    expect = (z[: shape[0] // 2], z[shape[0] // 2 :])
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(2, 3), (8, 10), (88, 44)], ids=str)
@pytest.mark.isolated_distributed
def test_reduce_scatter_sum_multishape(shape):
    run_reduce_scatter_sum(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_reduce_scatter_sum_multidtype(dtype):
    run_reduce_scatter_sum((8, 10), dtype)

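# all_reduce_sum (as exercised here): element-wise sum across ranks; every rank
# receives the full reduced tensor.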
def run_all_reduce_sum(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_reduce_sum(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = x + y
    data = (x, y)
    expect = (z, z)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_sum_multishape(shape):
    run_all_reduce_sum(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_sum_multidtype(dtype):
    run_all_reduce_sum((8, 10), dtype)

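# all_reduce_max (as exercised here): element-wise maximum across ranks; every rank
# receives the full reduced tensor.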
def run_all_reduce_max(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_reduce_max(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = np.maximum(x, y)
    data = (x, y)
    expect = (z, z)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_max_multishape(shape):
    run_all_reduce_max(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_max_multidtype(dtype):
    run_all_reduce_max((8, 10), dtype)

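# all_reduce_min (as exercised here): element-wise minimum across ranks; every rank
# receives the full reduced tensor.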
def run_all_reduce_min(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_reduce_min(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = np.minimum(x, y)
    data = (x, y)
    expect = (z, z)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_min_multishape(shape):
    run_all_reduce_min(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_min_multidtype(dtype):
    run_all_reduce_min((8, 10), dtype)

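# gather (as exercised here): the per-rank tensors are concatenated along the first
# dimension on rank 0 only; the other ranks get None.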
def run_gather(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = gather(inp)
        if rank == 0:
            assert np.allclose(output.numpy(), expect[rank])
        else:
            assert output is None

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = np.concatenate((x, y))
    data = (x, y)
    expect = (z, None)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_gather_multishape(shape):
    run_gather(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_gather_multidtype(dtype):
    run_gather((8, 10), dtype)

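# scatter (as exercised here): rank 0's tensor is split along the first dimension
# and each rank receives its own slice; the other ranks' inputs are ignored.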
def run_scatter(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = scatter(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = x + 1
    data = (x, y)
    expect = (x[: shape[0] // 2], x[shape[0] // 2 :])
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(2, 3), (8, 10), (100, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_scatter_multishape(shape):
    run_scatter(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_scatter_multidtype(dtype):
    run_scatter((8, 10), dtype)

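# all_to_all (as exercised here): each rank splits its tensor along the first
# dimension and exchanges chunks, so rank i ends up with chunk i from every rank.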
def run_all_to_all(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_to_all(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    a = np.concatenate((x[: shape[0] // 2], y[: shape[0] // 2]))
    b = np.concatenate((x[shape[0] // 2 :], y[shape[0] // 2 :]))
    data = (x, y)
    expect = (a, b)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(2, 3), (8, 10), (100, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_to_all_multishape(shape):
    run_all_to_all(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_to_all_multidtype(dtype):
    run_all_to_all((8, 10), dtype)

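# remote_send / remote_recv (as exercised here): point-to-point transfer; rank 0
# sends its tensor to rank 1, which receives it on its default device and checks
# the values. sync() is presumably called so the asynchronous send completes before
# the sending worker exits.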
def run_io_remote(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(val, shape):
        rank = dist.get_rank()
        if rank == 0:  # remote send
            x = tensor(val, device="xpu0")
            remote_send(x, 1)
            sync()
        else:  # remote recv
            y = remote_recv(0)
            assert y.device == get_default_device()
            np.testing.assert_almost_equal(val, y.numpy())

    val = np.random.random_sample(shape).astype(dtype)
    worker(val, shape)


@pytest.mark.require_ngpu(2)
@pytest.mark.isolated_distributed
@pytest.mark.parametrize("shape", [(), (1,), (4, 5)], ids=str)
def test_io_remote_multishape(shape):
    run_io_remote(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.isolated_distributed
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
def test_io_remote_multidtype(dtype):
    run_io_remote((8, 10), dtype)

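# test_cuda_init_before_fork: creating a CUDA tensor in the parent process before
# calling dist.launcher is expected to fail; the launcher, which forks the worker
# processes, should raise an AssertionError in that case.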
@pytest.mark.require_ngpu(2)
def test_cuda_init_before_fork():
    a = mge.tensor(1, device="gpu0")

    @dist.launcher(n_gpus=2)
    def worker():
        a += 1
        b = mge.tensor(2)

    with pytest.raises(AssertionError):
        worker()