From b8cbd4510bfbab17630014587919b595f6d6945c Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Sun, 26 Apr 2020 17:11:34 +0800 Subject: [PATCH] fix(mgb/mm_server): return -1 when create_mm_server failed GitOrigin-RevId: 117bb80c06388d7b7c7ab71a787a1f03849a3871 --- python_module/megengine/distributed/util.py | 6 ++++-- python_module/src/cpp/mm_handler.cpp | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/python_module/megengine/distributed/util.py b/python_module/megengine/distributed/util.py index cdefc34a..52248d30 100644 --- a/python_module/megengine/distributed/util.py +++ b/python_module/megengine/distributed/util.py @@ -63,9 +63,11 @@ def init_process_group( set_default_device(mgb.comp_node("gpu" + str(dev))) if rank == 0: - res = mgb.config.create_mm_server("0.0.0.0", master_port) - if res != master_port: + _master_port = mgb.config.create_mm_server("0.0.0.0", master_port) + if _master_port == -1: raise Exception("Failed to start server on port {}".format(master_port)) + else: + assert master_port > 0, "master_port must be specified for non-zero rank" def is_distributed() -> bool: diff --git a/python_module/src/cpp/mm_handler.cpp b/python_module/src/cpp/mm_handler.cpp index b9da63c0..7225fb33 100644 --- a/python_module/src/cpp/mm_handler.cpp +++ b/python_module/src/cpp/mm_handler.cpp @@ -214,11 +214,14 @@ public: std::make_unique("tcp://" + server_addr, port, std::move(service)); port = server->port(); + if (port == -1) { + return -1; + } + auto full_srv_addr = ssprintf("%s:%d", server_addr.c_str(), port); server->run(); auto ins = m_addr2server.emplace( full_srv_addr, ServerInfo{std::move(server)}); - mgb_assert(ins.second); return port; }