From 497ef6c337e0a986a13dc82518692724a2d0f510 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Tue, 13 Jul 2021 12:51:02 +0800
Subject: [PATCH] fix(mge/dist): fix gl oom error

GitOrigin-RevId: 4ba3d2cfd74ed1d63274f175e232eeead1ec7b6d
---
 imperative/python/megengine/distributed/helper.py   | 4 ++--
 imperative/python/megengine/distributed/launcher.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/imperative/python/megengine/distributed/helper.py b/imperative/python/megengine/distributed/helper.py
index 5ad1b50f..f2d83ba0 100644
--- a/imperative/python/megengine/distributed/helper.py
+++ b/imperative/python/megengine/distributed/helper.py
@@ -186,9 +186,9 @@ def _get_device_count_worker(queue, device_type):
     queue.put(num)
 
 
-def _check_device_initialized(device_type: str):
+def _check_device_initialized(device_type: str, rank: int):
     try:
-        test = Tensor(1, device=device_type)
+        test = Tensor(1, device=(device_type + str(rank)))
         inited = False
         del test
     except:
diff --git a/imperative/python/megengine/distributed/launcher.py b/imperative/python/megengine/distributed/launcher.py
index b043705c..3e6d2b18 100644
--- a/imperative/python/megengine/distributed/launcher.py
+++ b/imperative/python/megengine/distributed/launcher.py
@@ -39,7 +39,7 @@ def _run_wrapped(
     machine_ranks: list,
 ):
     """Init distributed process group and run wrapped function."""
-    _check_device_initialized(device_type)
+    _check_device_initialized(device_type, dev)
     init_process_group(
         master_ip=master_ip,
         port=port,