Browse Source

refactor(mge/distributed): using nccl as default in distributed training

GitOrigin-RevId: 81268e84bc
release-1.5
Megvii Engine Team 3 years ago
parent
commit
1f0436967c
3 changed files with 9 additions and 2 deletions
  1. +1
    -1
      imperative/python/megengine/distributed/group.py
  2. +1
    -1
      imperative/python/megengine/distributed/launcher.py
  3. +7
    -0
      imperative/python/test/unit/utils/test_network_node.py

+ 1
- 1
imperative/python/megengine/distributed/group.py View File

@@ -104,7 +104,7 @@ class Group:
WORLD = Group([])


_devices = {"gpu", "cuda", "rocm"}
_backends = {"nccl", "rccl", "ucx", "auto"}
_backends = {"nccl", "rccl", "shm", "auto"}




def init_process_group(


+ 1
- 1
imperative/python/megengine/distributed/launcher.py View File

@@ -89,7 +89,7 @@ class launcher:
master_ip="localhost",
port=0,
device_type="xpu",
backend="auto",
backend="nccl",
):
self.func = func
self.n_gpus = n_gpus if n_gpus is not None else get_device_count(device_type)


+ 7
- 0
imperative/python/test/unit/utils/test_network_node.py View File

@@ -14,6 +14,10 @@ from megengine.core._imperative_rt.core2 import apply
from megengine.core._wrap import Device
from megengine.core.ops import builtin
from megengine.device import get_device_count, is_cuda_available
from megengine.functional.debug_param import (
get_execution_strategy,
set_execution_strategy,
)
from megengine.functional.external import tensorrt_runtime_opr
from megengine.jit.tracing import trace
from megengine.tensor import Tensor
@@ -106,10 +110,13 @@ def test_matmul():
def fwd(data1, data2):
return F.matmul(data1, data2)


old = get_execution_strategy()
set_execution_strategy("HEURISTIC_REPRODUCIBLE")
data1 = Tensor(np.random.random((32, 64)))
data2 = Tensor(np.random.random((64, 16)))
result = fwd(data1, data2)
check_pygraph_dump(fwd, [data1, data2], [result])
set_execution_strategy(old)




def test_batchmatmul():


Loading…
Cancel
Save