feat(whl/imperative): compat for build python whl imperative and legacy runtime

GitOrigin-RevId: 7f6629ae1f
4 years ago · 6e882c1a86
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -697,8 +697,10 @@ endif()
 if(MGE_WITH_PYTHON_MODULE)
    if(MGE_BUILD_IMPERATIVE_RT)
        add_subdirectory(imperative)
        message("-- Enable imperative python wrapper runtime")
    else()
        add_subdirectory(python_module)
        message("-- Enable legacy python wrapper runtime")
    endif()
 endif()
--- a/dnn/src/common/utils.h
+++ b/dnn/src/common/utils.h
@@ -342,7 +342,11 @@ template <typename T>
 struct SafeMultiplies;
 template <typename T>
 #if __cplusplus >= 201703L
 struct _SafeMultipliesImplUnsigned {
 #else
 struct _SafeMultipliesImplUnsigned : public std::binary_function<T, T, T> {
 #endif
    static MEGDNN_CONSTEXPR size_t nbits = sizeof(T) * 8;
    static size_t clz(unsigned x) {
--- a/dnn/test/CMakeLists.txt
+++ b/dnn/test/CMakeLists.txt
@@ -70,8 +70,10 @@ if (MEG_WITH_ROCM)
    target_link_libraries (megdnn_test ${MGE_ROCM_LIBS})
 endif ()
 if(APPLE OR ANDROID)
    target_link_libraries(megdnn_test dl)
 else()
    target_link_libraries(megdnn_test dl rt)
 if(UNIX)
    if(APPLE OR ANDROID)
        target_link_libraries(megdnn_test dl)
    else()
        target_link_libraries(megdnn_test dl rt)
    endif()
 endif()
--- a/dnn/test/common/mesh_indexing.h
+++ b/dnn/test/common/mesh_indexing.h
@@ -89,7 +89,7 @@ public:
        auto ptr = tensor.ptr<int>();
        for (size_t n = 0; n < size; ++n) {
            std::set<int> used;
            std::random_shuffle(seq.begin(), seq.end());
            COMPAT_RANDOM(seq.begin(), seq.end());
            for (size_t step = 0; step < stride; ++step) {
                megdnn_assert(used.size() < m_size);
                ptr[n * stride + step] = seq[step];
--- a/dnn/test/common/rng.cpp
+++ b/dnn/test/common/rng.cpp
@@ -75,7 +75,7 @@ Float16PeriodicalRNG::Float16PeriodicalRNG() : m_offset(0) {
        i2f.i = static_cast<uint16_t>(x);
        m_sequence.push_back(i2f.f);
    }
    std::random_shuffle(m_sequence.begin(), m_sequence.end());
    COMPAT_RANDOM(m_sequence.begin(), m_sequence.end());
 }
 Float16PeriodicalRNG::Float16PeriodicalRNG(size_t range) : m_offset(0) {
@@ -99,7 +99,7 @@ Float16PeriodicalRNG::Float16PeriodicalRNG(size_t range) : m_offset(0) {
        m_sequence.push_back(i2f.f);
    }
    std::random_shuffle(m_sequence.begin(), m_sequence.end());
    COMPAT_RANDOM(m_sequence.begin(), m_sequence.end());
 }
 void Float16PeriodicalRNG::gen(const TensorND& tensor) {
--- a/dnn/test/common/rng.h
+++ b/dnn/test/common/rng.h
@@ -19,6 +19,16 @@
 namespace megdnn {
 namespace test {
 // COMPAT_RANDOM(begin, end): shuffle the range [begin, end) in place.
 //
 // std::random_shuffle was removed in C++17, so that branch uses std::shuffle.
 // The engine is `static thread_local` so that its state persists between
 // calls made from the same call site: a freshly default-constructed engine
 // always starts from the same seed and would apply the very same permutation
 // on every call, which breaks callers that re-shuffle in a loop until they
 // obtain a different ordering. The sequence is still reproducible from run
 // to run because the engine keeps its default seed.
 //
 // The expansion stays a plain `{ ... }` block (and the legacy branch keeps its
 // trailing `;`) so that existing call sites compile unchanged.
 #if __cplusplus >= 201703L
 #define COMPAT_RANDOM(begin, end)                                  \
    {                                                              \
        static thread_local std::default_random_engine rng_engine; \
        std::shuffle(begin, end, rng_engine);                      \
    }
 #else
 #define COMPAT_RANDOM(begin, end) std::random_shuffle(begin, end);
 #endif
 class RNG {
 protected:
    class RNGxorshf;
--- a/dnn/test/cuda/argmxx.cpp
+++ b/dnn/test/cuda/argmxx.cpp
@@ -24,15 +24,16 @@ class ArgmxxRNG final: public RNG {
        void gen(const TensorND &tensor) override {
            auto offset = tensor.layout.span().low_elem;
            auto nr_elems = tensor.layout.span().dist_elem();
 #define cb(DType) \
            if (tensor.layout.dtype == DType()) { \
                using ctype = typename DTypeTrait<DType>::ctype; \
                auto ptr = tensor.ptr<ctype>(); \
                for (size_t i = 0; i < nr_elems; ++i) { \
                    ptr[offset+i] = i; \
                } \
                std::random_shuffle(ptr + offset, ptr + offset + nr_elems); \
            }
 #define cb(DType)                                             \
    if (tensor.layout.dtype == DType()) {                     \
        using ctype = typename DTypeTrait<DType>::ctype;      \
        auto ptr = tensor.ptr<ctype>();                       \
        for (size_t i = 0; i < nr_elems; ++i) {               \
            ptr[offset + i] = i;                              \
        }                                                     \
        COMPAT_RANDOM(ptr + offset, ptr + offset + nr_elems); \
    }
            MEGDNN_FOREACH_COMPUTING_DTYPE(cb);
 #undef cb
        }
--- a/dnn/test/cuda/argsort.cpp
+++ b/dnn/test/cuda/argsort.cpp
@@ -32,7 +32,7 @@ class ArgsortRNG final : public RNG {
        } else {
            for (int i = 0; i < n; ++i)
                ptr[i] = static_cast<T>(i - n / 2);
            std::random_shuffle(ptr, ptr + n);
            COMPAT_RANDOM(ptr, ptr + n);
        }
    }
@@ -86,7 +86,7 @@ void run_backward_test(Handle* handle, DType dtype) {
                for (size_t j = 0; j < n; ++j) {
                    ptr[j] = j;
                }
                std::random_shuffle(ptr, ptr + n);
                COMPAT_RANDOM(ptr, ptr + n);
                ptr += n;
            }
        }
--- a/dnn/test/cuda/relayout.cpp
+++ b/dnn/test/cuda/relayout.cpp
@@ -361,9 +361,8 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_7) {
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 50; nsample++) {
        std::random_shuffle(_dim.begin(), _dim.end());
        std::random_shuffle(permutation.begin(), permutation.end());
        COMPAT_RANDOM(_dim.begin(), _dim.end());
        COMPAT_RANDOM(permutation.begin(), permutation.end());
        if (!isTrivial(permutation)) {
            run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5],
                  _dim[6]},
@@ -451,9 +450,10 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_5) {
            printf("vol %d cur_ratio %lf | %lf\n", vol, cur_ratio, vol_re);
            // printVec(dim);
            std::random_shuffle(dim.begin(), dim.end());
            COMPAT_RANDOM(dim.begin(), dim.end());
            while (isTrivial(permutation)) {
                std::random_shuffle(permutation.begin(), permutation.end());
                COMPAT_RANDOM(permutation.begin(), permutation.end());
            }
            run({{dim[0], dim[1], dim[2], dim[3], dim[4]}, dtype::Int32()},
@@ -603,8 +603,9 @@ TEST_F(CUDA, BENCHMARK_LAST_CONTIG_ALIGN_TEST) {
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 20; nsample++) {
        std::random_shuffle(_dim.begin(), _dim.end() - 1);
        std::random_shuffle(permutation.begin(), permutation.end() - 1);
        COMPAT_RANDOM(_dim.begin(), _dim.end() - 1);
        COMPAT_RANDOM(permutation.begin(), permutation.end() - 1);
        if (nsample < 5)
            _dim[5] = (u.gen_single_val() / 4 + 1) * 4;
--- a/dnn/test/cuda/sleep.cpp
+++ b/dnn/test/cuda/sleep.cpp
@@ -24,7 +24,7 @@ using namespace test;
 TEST_F(CUDA, SLEEP) {
    auto opr = this->handle_cuda()->create_operator<Sleep>();
    auto opr = this->handle_cuda()->create_operator<megdnn::SleepForward>();
    auto run = [&](float time) -> double {
        opr->param() = {time};
--- a/dnn/test/rocm/argmxx.cpp
+++ b/dnn/test/rocm/argmxx.cpp
@@ -24,16 +24,17 @@ class ArgmxxRNG final: public RNG {
        void gen(const TensorND &tensor) override {
            auto offset = tensor.layout.span().low_elem;
            auto nr_elems = tensor.layout.span().dist_elem();
 #define cb(DType) \
            if (tensor.layout.dtype == DType()) { \
                using ctype = typename DTypeTrait<DType>::ctype; \
                auto ptr = tensor.ptr<ctype>(); \
                for (size_t i = 0; i < nr_elems; ++i) { \
                    ptr[offset+i] = i; \
                } \
                std::random_shuffle(ptr + offset, ptr + offset + nr_elems); \
                return; \
            }
 #define cb(DType)                                             \
    if (tensor.layout.dtype == DType()) {                     \
        using ctype = typename DTypeTrait<DType>::ctype;      \
        auto ptr = tensor.ptr<ctype>();                       \
        for (size_t i = 0; i < nr_elems; ++i) {               \
            ptr[offset + i] = i;                              \
        }                                                     \
        COMPAT_RANDOM(ptr + offset, ptr + offset + nr_elems); \
        return;                                               \
    }
            MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
 #undef cb
            megdnn_throw(megdnn_mangle(ssprintf("Unsupported DType: %s",
--- a/imperative/CMakeLists.txt
+++ b/imperative/CMakeLists.txt
@@ -76,7 +76,11 @@ add_custom_target(_version_ld SOURCES ${VERSION_SCRIPT})
 add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11)
 pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS})
 target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
 if (APPLE OR MSVC OR WIN32)
    target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn)
 else()
    target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
 endif()
 if (MGE_WITH_DISTRIBUTED)
    message("Imperative configured to link megray")
    target_link_libraries(${MODULE_NAME} PRIVATE megray)
@@ -91,6 +95,10 @@ set_target_properties(${MODULE_NAME} PROPERTIES
    SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}
    LIBRARY_OUTPUT_DIRECTORY ${MEGENGINE_DIR}/${PACKAGE_NAME}/core
 )
 if (APPLE OR MSVC OR WIN32)
    message("-- overwriting SUFFIX at macos and windows before config by set_target_properties")
    pybind11_extension(${MODULE_NAME})
 endif()
 add_dependencies(${MODULE_NAME} gen_opr_py _version_ld)
 if(MGE_WITH_TEST AND MGE_ENABLE_RTTI)
--- a/imperative/python/megengine/init.py
+++ b/imperative/python/megengine/init.py
@@ -8,6 +8,67 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import os
 import sys
 import platform
 import ctypes
 # Windows only: register the package's core/lib directory with the DLL loader
 # and eagerly load every *.dll found there, raising a descriptive OSError if a
 # library cannot be loaded.
 if sys.platform == "win32":
    lib_path = os.path.join(os.path.dirname(__file__), "core/lib")
    # keep only the candidate directories that exist on disk
    dll_paths = list(filter(os.path.exists, [lib_path,]))
    assert len(dll_paths) > 0

    kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
    # probe whether this kernel32 exports AddDllDirectory; the flag also gates
    # the use of LoadLibraryExW further down
    has_load_library_attr = hasattr(kernel32, "AddDllDirectory")
    # 0x0001 is SEM_FAILCRITICALERRORS in the Win32 headers; the previous mode
    # is saved here and restored at the end of this block
    old_error_mode = kernel32.SetErrorMode(0x0001)

    # with restype c_void_p, ctypes returns a NULL handle as None, which is
    # what the `res is None` failure checks below rely on
    kernel32.LoadLibraryW.restype = ctypes.c_void_p
    if has_load_library_attr:
        kernel32.AddDllDirectory.restype = ctypes.c_void_p
        kernel32.LoadLibraryExW.restype = ctypes.c_void_p

    # step 1: add each existing directory to the DLL search path
    for dll_path in dll_paths:
        if sys.version_info >= (3, 8):
            os.add_dll_directory(dll_path)
        elif has_load_library_attr:
            res = kernel32.AddDllDirectory(dll_path)
            if res is None:
                err = ctypes.WinError(ctypes.get_last_error())
                err.strerror += ' Error adding "{}" to the DLL search PATH.'.format(
                    dll_path
                )
                raise err
        else:
            # neither mechanism is available: only warn, loading is still
            # attempted below through the PATH fallback
            print("WARN: python or OS env have some issue, may load DLL failed!!!")

    import glob

    # step 2: load every DLL shipped in core/lib
    dlls = glob.glob(os.path.join(lib_path, "*.dll"))
    path_patched = False
    for dll in dlls:
        is_loaded = False
        if has_load_library_attr:
            # 0x00001100 == LOAD_LIBRARY_SEARCH_DEFAULT_DIRS (0x1000)
            #             | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR (0x0100)
            res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
            last_error = ctypes.get_last_error()
            # error 126 is ERROR_MOD_NOT_FOUND: not fatal here, the
            # LoadLibraryW fallback below gets a second chance
            if res is None and last_error != 126:
                err = ctypes.WinError(last_error)
                err.strerror += ' Error loading "{}" or one of its dependencies.'.format(
                    dll
                )
                raise err
            elif res is not None:
                is_loaded = True
        if not is_loaded:
            # fallback: prepend the DLL directories to PATH (done once) and
            # retry with plain LoadLibraryW
            if not path_patched:
                os.environ["PATH"] = ";".join(dll_paths + [os.environ["PATH"]])
                path_patched = True
            res = kernel32.LoadLibraryW(dll)
            if res is None:
                err = ctypes.WinError(ctypes.get_last_error())
                err.strerror += ' Error loading "{}" or one of its dependencies.'.format(
                    dll
                )
                raise err

    # restore the error mode saved above
    kernel32.SetErrorMode(old_error_mode)
 from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
 from .device import *
--- a/imperative/python/megengine/utils/max_recursion_limit.py
+++ b/imperative/python/megengine/utils/max_recursion_limit.py
@@ -6,10 +6,14 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import resource
 import platform
 import sys
 import threading
 # The resource module is not available on Windows
 if platform.system() != "Windows":
    import resource
 class AlternativeRecursionLimit:
    r"""A reentrant context manager for setting global recursion limits.
@@ -28,16 +32,24 @@ class AlternativeRecursionLimit:
        with self.lock:
            if self.count == 0:
                self.orig_py_limit = sys.getrecursionlimit()
            if platform.system() != "Windows":
                (
                    self.orig_rlim_stack_soft,
                    self.orig_rlim_stack_hard,
                ) = resource.getrlimit(resource.RLIMIT_STACK)
                resource.setrlimit(
                    resource.RLIMIT_STACK,
                    (self.orig_rlim_stack_hard, self.orig_rlim_stack_hard),
                )
                # increase recursion limit
                sys.setrecursionlimit(self.new_py_limit)
                # FIXME: https://bugs.python.org/issue34602 -- official python3 release
                # builds on macOS always hit this issue, and not every user builds python3 from source
                try:
                    resource.setrlimit(
                        resource.RLIMIT_STACK,
                        (self.orig_rlim_stack_hard, self.orig_rlim_stack_hard),
                    )
                except ValueError as exc:
                    if platform.system() != "Darwin":
                        raise exc
            # increase recursion limit
            sys.setrecursionlimit(self.new_py_limit)
            self.count += 1
    def __exit__(self, type, value, traceback):
@@ -45,10 +57,16 @@ class AlternativeRecursionLimit:
            self.count -= 1
            if self.count == 0:
                sys.setrecursionlimit(self.orig_py_limit)
                resource.setrlimit(
                    resource.RLIMIT_STACK,
                    (self.orig_rlim_stack_soft, self.orig_rlim_stack_hard),
                )
            if platform.system() != "Windows":
                try:
                    resource.setrlimit(
                        resource.RLIMIT_STACK,
                        (self.orig_rlim_stack_soft, self.orig_rlim_stack_hard),
                    )
                except ValueError as exc:
                    if platform.system() != "Darwin":
                        raise exc
 _max_recursion_limit_context_manager = AlternativeRecursionLimit(2 ** 31 - 1)
--- a/imperative/python/setup.py
+++ b/imperative/python/setup.py
@@ -9,6 +9,7 @@
 import os
 import re
 import pathlib
 import platform
 from distutils.file_util import copy_file
 from setuptools import setup, find_packages, Extension
 from setuptools.command.build_ext import build_ext as _build_ext
@@ -29,7 +30,10 @@ class build_ext(_build_ext):
            extdir.parent.mkdir(parents=True, exist_ok=True)
            modpath = self.get_ext_fullname(ext.name).split('.')
            modpath[-1] += '.so'
            if platform.system() == 'Windows':
                modpath[-1] += '.pyd'
            else:
                modpath[-1] += '.so'
            modpath = str(pathlib.Path(*modpath).resolve())
            copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run)
@@ -47,6 +51,14 @@ if local_version:
    __version__ = '{}+{}'.format(__version__, local_version)
 packages = find_packages(exclude=['test'])
 package_data = [
    str(f.relative_to('megengine'))
    for f in pathlib.Path('megengine', 'core', 'include').glob('**/*')
 ]
 package_data += [
    str(f.relative_to('megengine'))
    for f in pathlib.Path('megengine', 'core', 'lib').glob('**/*')
 ]
 with open('requires.txt') as f:
    requires = f.read().splitlines()
@@ -63,6 +75,9 @@ setup_kwargs = dict(
    author='Megvii Engine Team',
    author_email=email,
    packages=packages,
    package_data={
        'megengine': package_data,
    },
    ext_modules=[PrecompiledExtesion('megengine.core._imperative_rt')],
    install_requires=requires,
    extras_require={
--- a/imperative/python/src/helper.cpp
+++ b/imperative/python/src/helper.cpp
@@ -9,15 +9,6 @@
 #include "megbrain/utils/mempool.h"
 #include "./numpy_dtypes.h"
 /*
 * demangle typeid, see
 * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
 */
 #ifdef __GNUG__
 #include <cstdlib>
 #include <memory>
 #include <cxxabi.h>
 namespace py = pybind11;
 PyTaskDipatcher py_task_q = {};
@@ -34,10 +25,18 @@ py::module rel_import(py::str name, py::module m, int level) {
    return import(name, m.attr("__dict__"), py::arg("level")=level);
 }
 /*
 * demangle typeid, see
 * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
 */
 #ifdef __GNUG__
 #include <cxxabi.h>
 #include <cstdlib>
 #include <memory>
 namespace {
 std::string demangle_typeid(const char* name) {
    int status = -4; // some arbitrary value to eliminate the compiler warning
    // enable c++11 by passing the flag -std=c++11 to g++
@@ -48,7 +47,7 @@ std::string demangle_typeid(const char* name) {
    return (status==0) ? res.get() : name ;
 }
 }
 }  // namespace
 #else
 namespace {
--- a/imperative/python/src/utils.cpp
+++ b/imperative/python/src/utils.cpp
@@ -1,4 +1,8 @@
 #include "utils.h"
 #ifdef WIN32
 #include <stdio.h>
 #include <windows.h>
 #endif
 #include <pybind11/operators.h>
 #include <atomic>
--- a/imperative/python/test/integration/test_dp_correctness.py
+++ b/imperative/python/test/integration/test_dp_correctness.py
@@ -8,6 +8,7 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import multiprocessing as mp
 import os
 import platform
 import re
 import subprocess
 import sys
@@ -196,6 +197,9 @@ def run_test(
@pytest.mark.isolated_distributed
@pytest.mark.skipif(
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 def test_dp_correctness():
    model_name = "mnist_model_with_test.mge"
    model_path = os.path.join(os.path.dirname(__file__), model_name)
--- a/imperative/python/test/unit/functional/test_distributed.py
+++ b/imperative/python/test/unit/functional/test_distributed.py
@@ -35,7 +35,7 @@ from megengine.functional.distributed import (
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_reduce_sum():
@@ -77,7 +77,7 @@ def test_reduce_sum():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_broadcast():
@@ -115,7 +115,7 @@ def test_broadcast():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_all_gather():
@@ -154,7 +154,7 @@ def test_all_gather():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_reduce_scatter_sum():
@@ -193,7 +193,7 @@ def test_reduce_scatter_sum():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_all_reduce_sum():
@@ -232,7 +232,7 @@ def test_all_reduce_sum():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_all_reduce_max():
@@ -271,7 +271,7 @@ def test_all_reduce_max():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_all_reduce_min():
@@ -310,7 +310,7 @@ def test_all_reduce_min():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_gather():
@@ -352,7 +352,7 @@ def test_gather():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_scatter():
@@ -390,7 +390,7 @@ def test_scatter():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_all_to_all():
@@ -430,7 +430,7 @@ def test_all_to_all():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@pytest.mark.skipif(
    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
@pytest.mark.isolated_distributed
 def test_io_remote():
--- a/imperative/python/test/unit/test_autodiff.py
+++ b/imperative/python/test/unit/test_autodiff.py
@@ -6,6 +6,7 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import platform
 import weakref
 import numpy as np
@@ -51,6 +52,9 @@ def save_to(self, name="grad"):
@pytest.mark.isolated_distributed
@pytest.mark.skipif(
    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 def test_dist_grad():
    world_size = 2
    x_np = np.random.rand(10).astype("float32")
--- a/imperative/src/impl/profiler.cpp
+++ b/imperative/src/impl/profiler.cpp
@@ -9,7 +9,17 @@
 #include "megbrain/imperative/profiler.h"
 #if defined(_MSC_VER) || defined(WIN32)
 #include <windows.h>
 #define getpid GetCurrentProcessId
 #else
 #include <sys/unistd.h>
 #endif
 #if defined(__APPLE__) || defined(__MACOSX)
 #include <unistd.h>
 #endif
 #include <variant>
 #include "megbrain/imperative/ops/opr_attr.h"
--- a/imperative/src/impl/proxy_graph.cpp
+++ b/imperative/src/impl/proxy_graph.cpp
@@ -16,6 +16,10 @@
 #include "megbrain/imperative/ops/opr_attr.h"
 #include "megbrain/imperative/ops/backward_graph.h"
 #if __cplusplus >= 201703L
 #include <optional>
 #endif
 namespace mgb {
 namespace imperative {
--- a/imperative/test/CMakeLists.txt
+++ b/imperative/test/CMakeLists.txt
@@ -38,8 +38,11 @@ if(CXX_SUPPORT_WCLASS_MEMACCESS)
 endif()
 if(UNIX)
    target_link_libraries(imperative_test dl rt)
    if(APPLE OR ANDROID)
        target_link_libraries(imperative_test dl)
    else()
        target_link_libraries(imperative_test dl rt)
    endif()
 endif()
 install(TARGETS imperative_test RUNTIME DESTINATION test)
--- a/python_module/CMakeLists.txt
+++ b/python_module/CMakeLists.txt
@@ -81,7 +81,10 @@ else()
    target_link_libraries(mgb megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
 endif()
 target_include_directories(mgb PRIVATE ${PYTHON_INCLUDE_DIRS} src/cpp ${CMAKE_CURRENT_BINARY_DIR} ${NUMPY_INCLUDE_DIR})
 target_link_libraries(mgb ${PYTHON_LIBRARIES})
 # only Windows needs to link against PYTHON_LIBRARIES
 if(MSVC OR WIN32)
    target_link_libraries(mgb ${PYTHON_LIBRARIES})
 endif()
 if (MGE_WITH_DISTRIBUTED)
    target_link_libraries(mgb megray)
--- a/scripts/cmake-build/BUILD_README.md
+++ b/scripts/cmake-build/BUILD_README.md
@@ -30,11 +30,17 @@
    4e: add C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32\cuda\bin to system Path env
    4f: add C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5\lib Path
    if you skip 4d/4e/4f, the CUDA runtime cannot find its DLLs
    5: install python3 (default version 3.8.3) to /c/Users/${USER}/mge_whl_python_env/3.8.3, add it to
    the PATH env, and run python3 -m pip install numpy (if you want to build with training mode or build the python whl)
    6: install swig from install gui (if you want to build with training mode or build the python whl)
       a: download swig: https://nchc.dl.sourceforge.net/project/swig/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip
       b: install swig to /c/Users/${USER}/swigwin-4.0.2
       c: apply scripts/whl/windows/fix-ptr-define-issue.patch to c/Users/${USER}/swigwin-4.0.2
    ```
 ### linux host build
    ```
    1: cmake, which version > 3.14.4
    2: gcc/g++, which version > 6
    2: gcc/g++, version > 6 (gcc/g++ >= 7 if you need to build with training mode)
    3: install build-essential git git-lfs gfortran libgfortran-6-dev autoconf gnupg flex bison gperf curl 
    4: zlib1g-dev gcc-multilib g++-multilib lib32ncurses5-dev libxml2-utils xsltproc unzip libtool:
    5: librdmacm-dev rdmacm-utils python3-dev swig python3-numpy texinfo
@@ -47,6 +53,7 @@
    3: brew install python python3 swig coreutils
    4: install at least xcode command line tool: https://developer.apple.com/xcode/
    5: about cuda: we do not support CUDA on macos
    6: python3 -m pip install numpy (if you want to build with training mode or build the python whl)
    ```
 ### cross build for arm-android
    now we support windows/linux/macos cross build to arm-android
--- a/scripts/cmake-build/host_build.sh
+++ b/scripts/cmake-build/host_build.sh
@@ -9,6 +9,7 @@ function usage() {
    echo "-t : Build with training mode, default inference only"
    echo "-m : Build with m32 mode(only for windows build), default m64"
    echo "-r : remove old build dir before make, default off"
    echo "-n : enable new python runtime(valid when training mode with -t, default is legacy runtime)"
    echo "-h : show usage"
    echo "append other cmake config by export EXTRA_CMAKE_ARGS=..."
    echo "example: $0 -d"
@@ -22,9 +23,10 @@ MGE_WINDOWS_BUILD_ARCH=x64
 MGE_WINDOWS_BUILD_MARCH=m64
 MGE_ARCH=x86_64
 REMOVE_OLD_BUILD=false
 MGE_BUILD_IMPERATIVE_RT=OFF
 echo "EXTRA_CMAKE_ARGS: ${EXTRA_CMAKE_ARGS}"
 while getopts "rhdctm" arg
 while getopts "rhdctmn" arg
 do
    case $arg in
        d)
@@ -48,11 +50,15 @@ do
            REMOVE_OLD_BUILD=true
            ;;
        m)
            echo "build for m32(only use for windows)"
            echo "build for m32(only valid use for windows)"
            MGE_WINDOWS_BUILD_ARCH=x86
            MGE_WINDOWS_BUILD_MARCH=m32
            MGE_ARCH=i386
            ;;
        n)
            echo "Enable imperative python wrapper runtime"
            MGE_BUILD_IMPERATIVE_RT=ON
            ;;
        ?)
            echo "unkonw argument"
            usage
@@ -101,6 +107,7 @@ function cmake_build() {
    cmake \
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
        -DMGE_INFERENCE_ONLY=$MGE_INFERENCE_ONLY \
        -DMGE_BUILD_IMPERATIVE_RT=${MGE_BUILD_IMPERATIVE_RT} \
        -DMGE_WITH_CUDA=$MGE_WITH_CUDA \
        -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR \
        ${EXTRA_CMAKE_ARGS} \
@@ -112,7 +119,7 @@ function cmake_build() {
 function windows_env_err() {
    echo "check windows env failed!!"
    echo "please install LLVM/clang-cl/cmake/python at Visual Studio Extensions"
    echo "please install env refs for: scripts/cmake-build/BUILD_README.md"
    exit -1
 }
@@ -178,6 +185,25 @@ function prepare_env_for_windows_build() {
    export CPATH=$CPATH:$NIVIDA_INSTALL_PRE/${TRT_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include/nvtx3:$PC_CUDNN_INCLUDE_DIRS
    export LIBRARY_PATH=$LIBRARY_PATH:$LD_LIBRARY_PATH
    export INCLUDE=$INCLUDE:$CPATH
    # the python version is normally configured by the whl build script or the CI
    # script; fall back to a default version so that calling host_build.sh directly still works
    if [[ -z ${ALREADY_CONFIG_PYTHON_VER} ]]
    then
        echo "config a default python3"
        DFT_PYTHON_BIN=/c/Users/${USER}/mge_whl_python_env/3.8.3
        if [ ! -f "${DFT_PYTHON_BIN}/python3.exe" ]; then
            echo "ERR: can not find ${DFT_PYTHON_BIN}/python3.exe , Invalid env"
            windows_env_err
        else
            echo "put python3 to env..."
            export PATH=${DFT_PYTHON_BIN}:$PATH
            which python3
        fi
    fi
    echo "export swig pwd to PATH"
    export PATH=/c/Users/${USER}/swigwin-4.0.2::$PATH
 }
 WINDOWS_BUILD_TARGET="Ninja all > build.log"
@@ -218,6 +244,7 @@ function cmake_build_windows() {
        vcvarsall.bat $MGE_WINDOWS_BUILD_ARCH && cmake  -G "Ninja" \
        -DMGE_ARCH=$MGE_ARCH \
        -DMGE_INFERENCE_ONLY=$MGE_INFERENCE_ONLY \
        -DMGE_BUILD_IMPERATIVE_RT=${MGE_BUILD_IMPERATIVE_RT} \
        -DMGE_WITH_CUDA=$MGE_WITH_CUDA \
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
        -DCMAKE_INSTALL_PREFIX:PATH=$INSTALL_DIR  \
@@ -230,8 +257,18 @@ function cmake_build_windows() {
        ${WINDOWS_BUILD_TARGET}"
 }
 if [ ${MGE_BUILD_IMPERATIVE_RT} = "ON" ] && [ ${MGE_INFERENCE_ONLY} = "ON" ]; then
    echo "ERR: MGE_BUILD_IMPERATIVE_RT(-n) only valid when enable training mode(-t)"
    echo "pls remove -n or add -t"
    exit -1
 fi
 if [[ $OS =~ "NT" ]]; then
    if [ ${MGE_ARCH} = "i386" ] && [ ${MGE_INFERENCE_ONLY} = "OFF" ]; then
        echo "ERR: training mode(-t) only support 64 bit mode"
        echo "pls remove -t or remove -m"
        exit -1
    fi
    config_windows_build_target
    cmake_build_windows $MGE_WITH_CUDA $MGE_INFERENCE_ONLY $BUILD_TYPE
 else
--- a/scripts/whl/BUILD_PYTHON_WHL_README.md
+++ b/scripts/whl/BUILD_PYTHON_WHL_README.md
@@ -53,10 +53,6 @@
       d0: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install --upgrade pip
       d1: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install -r python_module/requires-test.txt
       d2: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install numpy wheel requests tqdm tabulate
    5: install swig from install gui
       a: download swig: https://nchc.dl.sourceforge.net/project/swig/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip
       b: install swig to /c/Users/${USER}/swigwin-4.0.2
       c: apply scripts/whl/windows/fix-ptr-define-issue.patch to c/Users/${USER}/swigwin-4.0.2
    ```
 # how to build
@@ -90,6 +86,11 @@
    ```
    ALL_PYTHON=3.5.9 ./scripts/whl/macos/macos_build_whl.sh
    ```
    If you want to build with the imperative runtime, set env BUILD_IMPERATIVE="ON", e.g.:
    ```
    ALL_PYTHON=3.5.9 BUILD_IMPERATIVE="ON" ./scripts/whl/macos/macos_build_whl.sh
    ```
 ## build for windows
    ```
    ./scripts/whl/windows/windows_build_whl.sh
@@ -102,5 +103,7 @@
    If you want to build the Windows whl with CUDA and a specific Python version, e.g.:
    ```
    WINDOWS_WHL_WITH_CUDA="true" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
    WINDOWS_WHL_WITH_CUDA="ON" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
    ```
    If you want to build with imperative rt, set env BUILD_IMPERATIVE="ON", eg:
    BUILD_IMPERATIVE="ON" WINDOWS_WHL_WITH_CUDA="ON" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
--- a/scripts/whl/macos/macos_build_whl.sh
+++ b/scripts/whl/macos/macos_build_whl.sh
@@ -65,16 +65,18 @@ function config_python_env() {
    fi
    echo ${ver}
    #config a dir to trick cmake find a null pythonlib
    PYTHON_LIBRARY=${PYTHON_DIR}lib/
    if [ "$1" = "3.5.9" ]; then
        PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.5m
        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.5m.dylib
    elif [ "$1" = "3.6.10" ]; then
        PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.6m
        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.6m.dylib
    elif [ "$1" = "3.7.7" ]; then
        PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.7m
        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.7m.dylib
    elif [ "$1" = "3.8.3" ]; then
        PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.8
        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.8.dylib
    else
        echo "ERR: DO NOT SUPPORT PYTHON VERSION"
        echo "now support list: ${FULL_PYTHON_VER}"
@@ -82,6 +84,11 @@ function config_python_env() {
    fi
 }
 # Fall back to the legacy python runtime when BUILD_IMPERATIVE is unset or empty.
 BUILD_IMPERATIVE="${BUILD_IMPERATIVE:-OFF}"
 function do_build() {
    for ver in ${ALL_PYTHON}
    do
@@ -89,7 +96,7 @@ function do_build() {
        config_python_env ${ver}
        #check env
        if [ ! -d "$PYTHON_LIBRARY" ]; then
        if [ ! -f "$PYTHON_LIBRARY" ]; then
            echo "ERR: can not find $PYTHON_LIBRARY , Invalid python package"
            err_env
        fi
@@ -102,14 +109,20 @@ function do_build() {
        #append cmake args for config python
        export EXTRA_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${PYTHON_DIR} -DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} "
        #config build type to RelWithDebInfo to enable MGB_ENABLE_DEBUG_UTIL etc
        export EXTRA_CMAKE_ARGS=${EXTRA_CMAKE_ARGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo "
        export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo "
        #call build and install
        #FIXME: cmake do not triger update python config, after
        #change PYTHON_LIBRARY and PYTHON_INCLUDE_DIR, so add
        #-r to remove build cache after a new ver build, which
        #will be more slow build than without -r
        ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
            echo "build whl with IMPERATIVE python rt"
            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -n -r
        else
            echo "build whl with legacy python rt"
            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
        fi
        #call setup.py
        BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/
@@ -121,12 +134,47 @@ function do_build() {
        fi
        mkdir -p staging
        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
            echo "build whl with IMPERATIVE python rt"
            cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
            cd ${BUILD_DIR}/staging/megengine/core
            rt_file=`ls _imperative_rt.*.so`
            echo "rt file is: ${rt_file}"
            if [[ -z ${rt_file} ]]
            then
                echo "ERR: can not find valid rt file"
                exit -1
            fi
            llvm-strip -s ${rt_file}
            mv ${rt_file} _imperative_rt.so
            echo "check so valid or not..."
            otool_out=`otool -L _imperative_rt.so`
            if [[ "${otool_out}" =~ "ython" ]]; then
                echo "ERR: invalid _imperative_rt.so which depend on python lib, detail: log"
                echo ${otool_out}
                exit -1
            else
                echo "valid..."
            fi
        else
            echo "build whl with legacy python rt"
            cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
            cd ${BUILD_DIR}/staging/megengine/_internal
            #FIXME: set lib suffix to dylib may be better, BUT we find after distutils.file_util.copy_file
            #will change to .so at macos even we set suffix to dylib, at the same time, macos also support .so
            echo "check so valid or not..."
            llvm-strip -s _mgb.so
            otool_out=`otool -L _mgb.so`
            if [[ "${otool_out}" =~ "ython" ]]; then
                echo "ERR: invalid _mgb.so which depend on python lib, detail: log"
                echo ${otool_out}
                exit -1
            else
                echo "valid..."
            fi
        fi
        cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
        cd ${BUILD_DIR}/staging/megengine/_internal
        #FIXME: set lib suffix to dylib may be better, BUT we find after distutils.file_util.copy_file
        #will change to .so at macos even we set suffix to dylib, at the same time, macos also support .so
        llvm-strip -s _mgb.so
        cd ${BUILD_DIR}/staging
        ${PYTHON_DIR}/bin/python3 setup.py bdist_wheel
        cd ${BUILD_DIR}/staging/dist/
--- a/scripts/whl/windows/windows_build_whl.sh
+++ b/scripts/whl/windows/windows_build_whl.sh
@@ -14,8 +14,6 @@ function err_env() {
 }
 function append_path_env_and_check() {
    echo "export swig pwd to PATH"
    export PATH=/c/Users/${USER}/swigwin-4.0.2::$PATH
    echo  "export vs2019 install path"
    export VS_PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2019/Enterprise
    # for llvm-strip
@@ -62,7 +60,7 @@ function config_python_env() {
 if [[ -z ${WINDOWS_WHL_WITH_CUDA} ]]
 then
    WINDOWS_WHL_WITH_CUDA="false"
    WINDOWS_WHL_WITH_CUDA="OFF"
 fi
@@ -74,26 +72,46 @@ CUBLAS_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublas6
 CURAND_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/curand64_10.dll"
 CUBLASLT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublasLt64_10.dll"
 CUDART_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cudart64_101.dll"
 # Copy the TensorRT/cuDNN/CUDA runtime DLLs named by the *_LIB variables
 # into the directory given as $1.
 #   $1: destination directory (also stored in the global REAL_DST).
 # The destination is quoted so that a path containing whitespace is passed
 # to cp as a single argument instead of being word-split.
 function depend_real_copy() {
    REAL_DST=$1
    echo "real copy lib to $1"
    local dep_lib
    for dep_lib in \
        "${TRT_LIB}" \
        "${CUDNN_LIB}" \
        "${CUSOLVER_LIB}" \
        "${CUBLAS_LIB}" \
        "${CURAND_LIB}" \
        "${CUBLASLT_LIB}" \
        "${CUDART_LIB}"
    do
        cp "${dep_lib}" "${REAL_DST}"
    done
 }
 function copy_more_dll() {
    # for python whl real use
    CP_DST=${BUILD_DIR}/staging/megengine/_internal/lib
    rm -rf ${CP_DST}
    mkdir ${CP_DST}
    if [ ${BUILD_IMPERATIVE} = "ON" ]; then
        echo "config BUILD_IMPERATIVE core lib dir"
        CP_WHL_DST=${BUILD_DIR}/staging/megengine/core/lib
    else
        echo "config legacy python lib dir"
        CP_WHL_DST=${BUILD_DIR}/staging/megengine/_internal/lib
    fi
    rm -rf ${CP_WHL_DST}
    mkdir ${CP_WHL_DST}
    # workround for cpu-only version import failed, use a
    # empty.file to triger setup.py to create a null empty
    echo "empty" > ${CP_WHL_DST}/empty.file
    if [ ${WINDOWS_WHL_WITH_CUDA} = "true" ]; then
    if [ ${WINDOWS_WHL_WITH_CUDA} = "ON" ]; then
        echo "copy nvidia lib to whl use...."
        cp "${TRT_LIB}" ${CP_DST}
        cp "${CUDNN_LIB}" ${CP_DST}
        cp "${CUSOLVER_LIB}" ${CP_DST}
        cp "${CUBLAS_LIB}" ${CP_DST}
        cp "${CURAND_LIB}" ${CP_DST}
        cp "${CUBLASLT_LIB}" ${CP_DST}
        cp "${CUDART_LIB}" ${CP_DST}
        depend_real_copy ${CP_WHL_DST}
    fi
 }
 # Fall back to the legacy python runtime when BUILD_IMPERATIVE is unset or empty.
 BUILD_IMPERATIVE="${BUILD_IMPERATIVE:-OFF}"
 function do_build() {
    for ver in ${ALL_PYTHON}
    do
@@ -118,21 +136,31 @@ function do_build() {
        #force LINK a real PYTHON_LIBRARY file, after test we do not find the symbols conflict with python
        #export EXTRA_CMAKE_ARGS="-DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} "
        #config build type to RelWithDebInfo to enable MGB_ENABLE_DEBUG_UTIL etc
        export EXTRA_CMAKE_ARGS=${EXTRA_CMAKE_ARGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo "
        export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo "
        #call build and install
        #FIXME: cmake do not triger update python config, after
        #change PYTHON_LIBRARY and PYTHON_INCLUDE_DIR, so add
        #-r to remove build cache after a new ver build, which
        #will be more slow build than without -r
        if [ ${WINDOWS_WHL_WITH_CUDA} = "true" ]; then
        BUILD_ARGS=" -t -r"
        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
            echo "build whl with IMPERATIVE python rt"
            BUILD_ARGS="${BUILD_ARGS} -n "
        else
            echo "build whl with legacy python rt"
        fi
        if [ ${WINDOWS_WHL_WITH_CUDA} = "ON" ]; then
            echo "build windows whl with cuda"
            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r -c
            BUILD_ARGS="${BUILD_ARGS} -c "
        else
            echo "build windows whl with cpu only"
            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
        fi
        echo "host_build.sh BUILD_ARGS: ${BUILD_ARGS}"
        ${SRC_DIR}/scripts/cmake-build/host_build.sh ${BUILD_ARGS}
        #call setup.py
        BUILD_DIR=${SRC_DIR}/build_dir/host/build/
        cd ${BUILD_DIR}
@@ -143,10 +171,27 @@ function do_build() {
        fi
        mkdir -p staging
        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
            echo "build whl with IMPERATIVE python rt"
            cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
            cd ${BUILD_DIR}/staging/megengine/core
            rt_file=`ls _imperative_rt.*.pyd`
            echo "rt file is: ${rt_file}"
            if [[ -z ${rt_file} ]]
            then
                echo "ERR: can not find valid rt file"
                exit -1
            fi
            llvm-strip -s ${rt_file}
            mv ${rt_file} _imperative_rt.pyd
        else
            echo "build whl with legacy python rt"
            cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
            cd ${BUILD_DIR}/staging/megengine/_internal
            llvm-strip -s _mgb.pyd
        fi
        cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
        cd ${BUILD_DIR}/staging/megengine/_internal
        llvm-strip -s _mgb.pyd
        copy_more_dll
        cd ${BUILD_DIR}/staging
        ${PYTHON_DIR}/python3 setup.py bdist_wheel
@@ -175,5 +220,6 @@ function third_party_prepare() {
 }
 ######################
 export ALREADY_CONFIG_PYTHON_VER="yes"
 third_party_prepare
 do_build
--- a/src/core/impl/graph/seq_sublinear_memory.cpp
+++ b/src/core/impl/graph/seq_sublinear_memory.cpp
@@ -33,6 +33,11 @@ class RNGxorshf {
    uint64_t s[2];
 public:
 #if __cplusplus >= 201703L
    typedef uint64_t result_type;
    static constexpr uint64_t min() { return 0; }
    static constexpr uint64_t max() { return UINT64_MAX; }
 #endif
    RNGxorshf(uint64_t seed) {
        std::mt19937_64 gen(seed);
        s[0] = gen();
@@ -936,8 +941,12 @@ void SeqModifierForSublinearMemory::ActionSearcherSingleCN::search_genetic() {
            }
        }
        m_cur_records = records;
 #if __cplusplus >= 201703L
        std::shuffle(perm.begin(), perm.end(), rng);
 #else
        std::random_shuffle(perm.begin(), perm.end(),
                            [&](size_t x) { return rng() % x; });
 #endif
        for (size_t i = 0; i < length; ++i) {
            invoke_search(mutation(mutation(records[i].first)));
            invoke_search(crossover(records[i].first, records[perm[i]].first));
--- a/src/opr/test/blas.cpp
+++ b/src/opr/test/blas.cpp
@@ -705,7 +705,12 @@ TEST(TestOprBlas, MatrixInverse) {
        }
        auto ptr = inp[0]->ptr<float>();
        for (size_t i = 0; i < batch; ++i, ptr += n * n) {
 #if __cplusplus >= 201703L
            std::default_random_engine rng_engine;
            std::shuffle(perm.begin(), perm.end(), rng_engine);
 #else
            std::random_shuffle(perm.begin(), perm.end());
 #endif
            for (size_t j = 0; j < n; ++j) {
                ptr[j * n + perm[j]] += 5;
            }
--- a/src/opr/test/muxing.cpp
+++ b/src/opr/test/muxing.cpp
@@ -36,7 +36,12 @@ void run_all_gather(const std::vector<size_t>& axis_size, bool& success,
        sleep_time.push_back(i * 0.05 + 0.1);
        tot_axis_size += axis_size[i];
    }
 #if __cplusplus >= 201703L
    std::default_random_engine rng_engine;
    std::shuffle(sleep_time.begin(), sleep_time.end(), rng_engine);
 #else
    std::random_shuffle(sleep_time.begin(), sleep_time.end());
 #endif
    auto constexpr DEVICE_TYPE = CompNode::DeviceType::CUDA;
    size_t nr_dev = std::min<size_t>(
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -18,7 +18,11 @@ endif()
 add_executable(megbrain_test ${SOURCES})
 target_link_libraries(megbrain_test gtest)
 target_link_libraries(megbrain_test megengine)
 if(MSVC OR WIN32)
    target_link_libraries(megbrain_test megbrain megdnn)
 else()
    target_link_libraries(megbrain_test megengine)
 endif()
 if(CXX_SUPPORT_WCLASS_MEMACCESS)
    if(MGE_WITH_CUDA)
        target_compile_options(megbrain_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-class-memaccess>"
@@ -28,10 +32,12 @@ if(CXX_SUPPORT_WCLASS_MEMACCESS)
    endif()
 endif()
 if(APPLE OR ANDROID)
    target_link_libraries(megbrain_test dl)
 else()
    target_link_libraries(megbrain_test dl rt)
 if(UNIX)
    if(APPLE OR ANDROID)
        target_link_libraries(megbrain_test dl)
    else()
        target_link_libraries(megbrain_test dl rt)
    endif()
 endif()
 if (MGE_WITH_DISTRIBUTED)