From 4f60fbbb2fc02fe658a0ce229c1ee9a45b651606 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Wed, 9 Mar 2022 17:23:15 +0800
Subject: [PATCH] fix(mge/distributed): add polling to solve xmlrpc server io error

GitOrigin-RevId: ca19b1566bc5e45a8c59b5945ccd2ee7f03e0736
---
Review notes (ignored by git-am):
  * Client.get_mm_server_port and Client.group_barrier now retry the
    XML-RPC call every 0.5 s until it succeeds.
  * The retry loops catch Exception instead of using a bare "except:",
    because a bare except also swallows KeyboardInterrupt and SystemExit
    and would make a worker that is polling an unreachable server
    impossible to interrupt.
  * The loops have no retry limit, so a permanently failing call still
    blocks forever; group_barrier is additionally flagged in the code as
    not idempotent, so a retry after a lost response may be counted twice.
  * Assumes server.py already imports "time"; this patch does not add the
    import -- TODO confirm.

 imperative/python/megengine/distributed/server.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/imperative/python/megengine/distributed/server.py b/imperative/python/megengine/distributed/server.py
index 90a16413..8a17f2c3 100644
--- a/imperative/python/megengine/distributed/server.py
+++ b/imperative/python/megengine/distributed/server.py
@@ -231,7 +231,11 @@ class Client:
 
     def get_mm_server_port(self):
         r"""Get multiple machine server port."""
-        return self.proxy.get_mm_server_port()
+        while True:
+            try:
+                return self.proxy.get_mm_server_port()
+            except Exception:
+                time.sleep(0.5)
 
     def set_is_grad(self, key, is_grad):
         r"""Mark send/recv need gradiants by key.
@@ -274,7 +278,13 @@ class Client:
             key: group key to match each other.
             size: group size.
         """
-        self.proxy.group_barrier(key, size)
+        # FIXME: group_barrier is not idempotent
+        while True:
+            try:
+                self.proxy.group_barrier(key, size)
+                return
+            except Exception:
+                time.sleep(0.5)
 
     def user_set(self, key, val):
         r"""Set user defined key-value pairs across processes."""