diff --git a/python_module/megengine/distributed/util.py b/python_module/megengine/distributed/util.py index cdefc34a..52248d30 100644 --- a/python_module/megengine/distributed/util.py +++ b/python_module/megengine/distributed/util.py @@ -63,9 +63,11 @@ def init_process_group( set_default_device(mgb.comp_node("gpu" + str(dev))) if rank == 0: - res = mgb.config.create_mm_server("0.0.0.0", master_port) - if res != master_port: + _master_port = mgb.config.create_mm_server("0.0.0.0", master_port) + if _master_port == -1: raise Exception("Failed to start server on port {}".format(master_port)) + else: + assert master_port > 0, "master_port must be specified for non-zero rank" def is_distributed() -> bool: diff --git a/python_module/src/cpp/mm_handler.cpp b/python_module/src/cpp/mm_handler.cpp index b9da63c0..7225fb33 100644 --- a/python_module/src/cpp/mm_handler.cpp +++ b/python_module/src/cpp/mm_handler.cpp @@ -214,11 +214,14 @@ public: std::make_unique("tcp://" + server_addr, port, std::move(service)); port = server->port(); + if (port == -1) { + return -1; + } + auto full_srv_addr = ssprintf("%s:%d", server_addr.c_str(), port); server->run(); auto ins = m_addr2server.emplace( full_srv_addr, ServerInfo{std::move(server)}); - mgb_assert(ins.second); return port; }