Browse Source

fix(mge/distributed): add polling to solve xmlrpc server io error

GitOrigin-RevId: ca19b1566b
tags/v1.9.0
Megvii Engine Team 3 years ago
parent
commit
4f60fbbb2f
1 changed file with 12 additions and 2 deletions
  1. +12
    -2
      imperative/python/megengine/distributed/server.py

+ 12
- 2
imperative/python/megengine/distributed/server.py View File

@@ -231,7 +231,11 @@ class Client:


def get_mm_server_port(self): def get_mm_server_port(self):
r"""Get multiple machine server port.""" r"""Get multiple machine server port."""
return self.proxy.get_mm_server_port()
while True:
try:
return self.proxy.get_mm_server_port()
except:
time.sleep(0.5)


def set_is_grad(self, key, is_grad): def set_is_grad(self, key, is_grad):
r"""Mark send/recv need gradients by key. r"""Mark send/recv need gradients by key.
@@ -274,7 +278,13 @@ class Client:
key: group key to match each other. key: group key to match each other.
size: group size. size: group size.
""" """
self.proxy.group_barrier(key, size)
# FIXME: group_barrier is not idempotent
while True:
try:
self.proxy.group_barrier(key, size)
return
except:
time.sleep(0.5)


def user_set(self, key, val): def user_set(self, key, val):
r"""Set user defined key-value pairs across processes.""" r"""Set user defined key-value pairs across processes."""


Loading…
Cancel
Save