Browse Source

fix(mgb/mm_server): return -1 when create_mm_server fails

GitOrigin-RevId: 117bb80c06
tags/v0.4.0
Megvii Engine Team Xinran Xu 5 years ago
parent
commit
b8cbd4510b
2 changed files with 8 additions and 3 deletions
  1. python_module/megengine/distributed/util.py (+4, -2)
  2. python_module/src/cpp/mm_handler.cpp (+4, -1)

+ 4
- 2
python_module/megengine/distributed/util.py View File

@@ -63,9 +63,11 @@ def init_process_group(
set_default_device(mgb.comp_node("gpu" + str(dev)))

if rank == 0:
res = mgb.config.create_mm_server("0.0.0.0", master_port)
if res != master_port:
_master_port = mgb.config.create_mm_server("0.0.0.0", master_port)
if _master_port == -1:
raise Exception("Failed to start server on port {}".format(master_port))
else:
assert master_port > 0, "master_port must be specified for non-zero rank"


def is_distributed() -> bool:


+ 4
- 1
python_module/src/cpp/mm_handler.cpp View File

@@ -214,11 +214,14 @@ public:
std::make_unique<ZmqRpc::ZmqRpcServer>("tcp://" + server_addr, port,
std::move(service));
port = server->port();
if (port == -1) {
return -1;
}

auto full_srv_addr = ssprintf("%s:%d", server_addr.c_str(), port);
server->run();
auto ins = m_addr2server.emplace(
full_srv_addr, ServerInfo{std::move(server)});
mgb_assert(ins.second);

return port;
}


Loading…
Cancel
Save