GitOrigin-RevId: 117bb80c06
tags/v0.4.0
@@ -63,9 +63,11 @@ def init_process_group( | |||||
set_default_device(mgb.comp_node("gpu" + str(dev))) | set_default_device(mgb.comp_node("gpu" + str(dev))) | ||||
if rank == 0: | if rank == 0: | ||||
res = mgb.config.create_mm_server("0.0.0.0", master_port) | |||||
if res != master_port: | |||||
_master_port = mgb.config.create_mm_server("0.0.0.0", master_port) | |||||
if _master_port == -1: | |||||
raise Exception("Failed to start server on port {}".format(master_port)) | raise Exception("Failed to start server on port {}".format(master_port)) | ||||
else: | |||||
assert master_port > 0, "master_port must be specified for non-zero rank" | |||||
def is_distributed() -> bool: | def is_distributed() -> bool: | ||||
@@ -214,11 +214,14 @@ public: | |||||
std::make_unique<ZmqRpc::ZmqRpcServer>("tcp://" + server_addr, port, | std::make_unique<ZmqRpc::ZmqRpcServer>("tcp://" + server_addr, port, | ||||
std::move(service)); | std::move(service)); | ||||
port = server->port(); | port = server->port(); | ||||
if (port == -1) { | |||||
return -1; | |||||
} | |||||
auto full_srv_addr = ssprintf("%s:%d", server_addr.c_str(), port); | auto full_srv_addr = ssprintf("%s:%d", server_addr.c_str(), port); | ||||
server->run(); | server->run(); | ||||
auto ins = m_addr2server.emplace( | auto ins = m_addr2server.emplace( | ||||
full_srv_addr, ServerInfo{std::move(server)}); | full_srv_addr, ServerInfo{std::move(server)}); | ||||
mgb_assert(ins.second); | |||||
return port; | return port; | ||||
} | } | ||||