Browse Source

fix(mge/dist): fix gl oom error

GitOrigin-RevId: 4ba3d2cfd7
release-1.5
Megvii Engine Team huangxinda 3 years ago
parent
commit
497ef6c337
2 changed files with 3 additions and 3 deletions
  1. +2
    -2
      imperative/python/megengine/distributed/helper.py
  2. +1
    -1
      imperative/python/megengine/distributed/launcher.py

+ 2
- 2
imperative/python/megengine/distributed/helper.py View File

@@ -186,9 +186,9 @@ def _get_device_count_worker(queue, device_type):
queue.put(num)


- def _check_device_initialized(device_type: str):
+ def _check_device_initialized(device_type: str, rank: int):
try:
- test = Tensor(1, device=device_type)
+ test = Tensor(1, device=(device_type + str(rank)))
inited = False
del test
except:


+ 1
- 1
imperative/python/megengine/distributed/launcher.py View File

@@ -39,7 +39,7 @@ def _run_wrapped(
machine_ranks: list,
):
"""Init distributed process group and run wrapped function."""
- _check_device_initialized(device_type)
+ _check_device_initialized(device_type, dev)
init_process_group(
master_ip=master_ip,
port=port,


Loading…
Cancel
Save