diff --git a/CMakeLists.txt b/CMakeLists.txt index 347da9f9..a4329709 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,10 +47,9 @@ option(MGE_DEBUG_UTIL "Enable debug utility" ON) option(MGE_ENABLE_EXCEPTIONS "Build with exceptions" ON) option(MGE_WITH_TEST "Enable test for MegEngine." OFF) option(MGE_WITH_DISTRIBUTED "Build with distributed support" ON) -option(MGE_BUILD_IMPERATIVE_RT "Build _imperative_rt.so instead of _mgb.so " OFF) +option(MGE_BUILD_IMPERATIVE_RT "Build _imperative_rt Python Module " ON) option(MGE_BUILD_SDK "Build load_and_run" ON) option(MGE_INFERENCE_ONLY "Build inference only library." OFF) -option(MGE_WITH_PYTHON_MODULE "Build MegEngine Python Module." ON) option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support," ON) option(MGE_WITH_ROCM "Enable ROCM support" OFF) @@ -256,8 +255,8 @@ endif() if(MGE_INFERENCE_ONLY) message("-- Disable distributed support for inference only build.") set(MGE_WITH_DISTRIBUTED OFF) - message("-- Disable python module for inference only build.") - set(MGE_WITH_PYTHON_MODULE OFF) + message("-- Disable imperative_rt python module for inference only build.") + set(MGE_BUILD_IMPERATIVE_RT OFF) endif() if(MGE_WITH_DISTRIBUTED) @@ -694,43 +693,18 @@ if(MGE_BUILD_SDK) add_subdirectory(sdk/load-and-run) endif() -if(MGE_WITH_PYTHON_MODULE) - if(MGE_BUILD_IMPERATIVE_RT) - add_subdirectory(imperative) - message("-- Enable imperative python wrapper runtime") - else() - add_subdirectory(python_module) - message("-- Enable legacy python wrapper runtime") - endif() + +if(MGE_BUILD_IMPERATIVE_RT) + add_subdirectory(imperative) + message("-- Enable imperative python wrapper runtime") endif() if(MGE_WITH_TEST AND MGE_ENABLE_RTTI) add_subdirectory(test) endif() -if(TARGET mgb) - add_custom_target( - develop - COMMAND ${CMAKE_COMMAND} -E create_symlink - ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/$ - ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/$ - COMMAND ${CMAKE_COMMAND} -E create_symlink - ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/mgb.py - ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/mgb.py - COMMAND ${CMAKE_COMMAND} -E create_symlink - ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/opr.py - ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/opr.py - COMMAND ${CMAKE_COMMAND} -E create_symlink - ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/opr_param_defs.py - ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/opr_param_defs.py - COMMAND ${CMAKE_COMMAND} -E create_symlink - ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/include - ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/include - DEPENDS mgb - VERBATIM - ) -elseif(TARGET _imperative_rt) +if(TARGET _imperative_rt) add_custom_target( develop COMMAND ${CMAKE_COMMAND} -E create_symlink diff --git a/imperative/python/megengine/core/tensor/multipledispatch/utils.py b/imperative/python/megengine/core/tensor/multipledispatch/utils.py index 4dcd0dc4..968430f3 100644 --- a/imperative/python/megengine/core/tensor/multipledispatch/utils.py +++ b/imperative/python/megengine/core/tensor/multipledispatch/utils.py @@ -183,25 +183,16 @@ def typename(type): # parse typing.Union -if sys.version_info < (3, 6): - - def parse_union(ann): +def parse_union(ann): + if hasattr(typing, "UnionMeta"): if type(ann) is not typing.UnionMeta: return return ann.__union_params__ - - -elif sys.version_info < (3, 7): - - def parse_union(ann): + elif hasattr(typing, "_Union"): 
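+        # feature detection instead of version checks: typing._Union is the
+        # Python 3.6 internal; Union aliases are instances of it and expose
+        # their members via __args__, e.g. Union[int, str].__args__ == (int, str)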
         if type(ann) is not typing._Union:
             return
         return ann.__args__
-
-
-elif sys.version_info < (3, 8):
-
-    def parse_union(ann):
+    elif hasattr(typing, "_GenericAlias"):
         if type(ann) is not typing._GenericAlias:
             if type(ann) is not typing.Union:
                 return
@@ -209,11 +200,9 @@ elif sys.version_info < (3, 8):
         if ann.__origin__ is not typing.Union:
             return
         return ann.__args__
-
-
-else:
-
-    def parse_union(ann):
+    elif hasattr(typing, "Union"):
         if typing.get_origin(ann) is not typing.Union:
             return
         return typing.get_args(ann)
+    else:
+        raise NotImplementedError("unsupported Python version")
diff --git a/imperative/python/setup.py b/imperative/python/setup.py
index c788b75c..19c6123c 100644
--- a/imperative/python/setup.py
+++ b/imperative/python/setup.py
@@ -6,6 +6,7 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
 import os
 import re
 import pathlib
@@ -55,11 +56,13 @@ package_data = [
     str(f.relative_to('megengine'))
     for f in pathlib.Path('megengine', 'core', 'include').glob('**/*')
 ]
+
 package_data += [
     str(f.relative_to('megengine'))
     for f in pathlib.Path('megengine', 'core', 'lib').glob('**/*')
 ]
+
 
 with open('requires.txt') as f:
     requires = f.read().splitlines()
 with open('requires-style.txt') as f:
@@ -67,6 +70,7 @@ with open('requires-style.txt') as f:
 with open('requires-test.txt') as f:
     requires_test = f.read().splitlines()
 
+prebuild_modules = [PrecompiledExtesion('megengine.core._imperative_rt')]
 setup_kwargs = dict(
     name=package_name,
     version=__version__,
@@ -78,7 +82,7 @@ setup_kwargs = dict(
     package_data={
         'megengine': package_data,
     },
-    ext_modules=[PrecompiledExtesion('megengine.core._imperative_rt')],
+    ext_modules=prebuild_modules,
     install_requires=requires,
     extras_require={
         'dev': requires_style + requires_test,
@@ -87,6 +91,7 @@ setup_kwargs = dict(
     cmdclass={'build_ext': build_ext},
 )
 
+
 setup_kwargs.update(dict(
     classifiers=[
         'Development Status :: 3 - Alpha',
diff --git a/imperative/python/test/run.sh b/imperative/python/test/run.sh
new file mode 100755
index 00000000..1e9676fb
--- /dev/null
+++ b/imperative/python/test/run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash -e
+
+test_dirs="test"
+TEST_PLAT=$1
+
+if [[ "$TEST_PLAT" == cpu ]]; then
+    echo "only test cpu pytest"
+elif [[ "$TEST_PLAT" == cuda ]]; then
+    echo "test both cpu and gpu pytest"
+else
+    echo "Argument must be cpu or cuda"
+    exit 1
+fi
+
+pushd $(dirname "${BASH_SOURCE[0]}")/.. >/dev/null
+    PYTHONPATH="." PY_IGNORE_IMPORTMISMATCH=1 python3 -m pytest $test_dirs -m 'not isolated_distributed'
+    if [[ "$TEST_PLAT" == cuda ]]; then
+        echo "test GPU pytest now"
+        PYTHONPATH="." 
PY_IGNORE_IMPORTMISMATCH=1 python3 -m pytest $test_dirs -m 'isolated_distributed' + fi +popd >/dev/null diff --git a/python_module/.gitignore b/python_module/.gitignore deleted file mode 100644 index f31367f5..00000000 --- a/python_module/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -/megbrain/_mgb.so -/megbrain/_mgb.*.so -/MegBrain.egg-info/ -/dist -/dist_cuda -/dist_nocuda -/wheel_dist -.cache diff --git a/python_module/CMakeLists.txt b/python_module/CMakeLists.txt deleted file mode 100644 index e23c2488..00000000 --- a/python_module/CMakeLists.txt +++ /dev/null @@ -1,113 +0,0 @@ -cmake_policy(SET CMP0086 NEW) - -find_package(PythonLibs ${PYTHON_VERSION_STRING} EXACT REQUIRED) - -find_package(Git) -if(GIT_FOUND) - message("git found: ${GIT_EXECUTABLE}") -endif() - -find_package(NumPy REQUIRED) - -find_package(SWIG REQUIRED) -set(SWIG_SRC src/swig/mgb.i) -if(MSVC OR WIN32) - set(CMAKE_SWIG_FLAGS -Wall -threads -py3 -DSWIGWORDSIZE64) - message("WARN: swig have some define issue at windows(64) env") - message("Please refs scripts/whl/BUILD_PYTHON_WHL_README.md to init windows build env") -else() - set(CMAKE_SWIG_FLAGS -Wall -threads -py3 -modern -DSWIGWORDSIZE64) -endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") - -file(GLOB_RECURSE OPR_DECL_SRCS "${PROJECT_SOURCE_DIR}/src/**/*.oprdecl") -file(GLOB_RECURSE PYTHON_SRCS setup.py - src/python/*.py - test/*.py - megengine/*.py) -list(REMOVE_ITEM PYTHON_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/megengine/_internal/mgb.py - ${CMAKE_CURRENT_SOURCE_DIR}/megengine/_internal/opr.py - ${CMAKE_CURRENT_SOURCE_DIR}/megengine/_internal/opr_param_defs.py -) -list(APPEND PYTHON_SRCS ${MGB_SRCS}) - -file(GLOB_RECURSE ALL_HEADERS src/cpp/megbrain_pubapi.h - ${PROJECT_SOURCE_DIR}/src/core/include/* - ${PROJECT_SOURCE_DIR}/src/opr/include/* - ${PROJECT_SOURCE_DIR}/src/serialization/include/* - ${PROJECT_SOURCE_DIR}/src/plugin/include/* - ${PROJECT_SOURCE_DIR}/dnn/include/*) - -file(COPY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) -file(READ ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py CONTENTS) -file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/opr_param_defs.py ${CONTENTS}) - -add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/opr.py ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/opr_param_defs.py - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/src/python ${CMAKE_CURRENT_BINARY_DIR}/src/python - COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/src/python/genopr.py ${OPR_DECL_SRCS} - COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_param_defs.py -t py ${CMAKE_CURRENT_BINARY_DIR}/opr_param_defs.py ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/opr_param_defs.py - DEPENDS ${OPR_DECL_SRCS} - VERBATIM -) - -add_custom_target(mgb_opr_py DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/opr.py) - -set(SRCS src/cpp/craniotome.cpp src/cpp/function_replace.cpp src/cpp/intbx.cpp src/cpp/bfloat16.cpp src/cpp/megbrain_config.cpp src/cpp/megbrain_pubapi.cpp src/cpp/megbrain_serialize.cpp src/cpp/megbrain_wrap.cpp src/cpp/opr_defs.cpp src/cpp/opr_helper.cpp src/cpp/plugin.cpp src/cpp/python_helper.cpp) - -include(UseSWIG) -set_property(SOURCE ${SWIG_SRC} PROPERTY CPLUSPLUS ON) - -# cmake < 3.12 do not honor INCLUDE_DIRECTORIES property, just add include directory into SWIG_FLAGS -# Add -I${PROJECT_BINARY_DIR}/genfiles in order to 
include megbrain_build_config.h so that we don't need to pass cmake flags by -D. -set_property(SOURCE ${SWIG_SRC} PROPERTY SWIG_FLAGS -I${PROJECT_SOURCE_DIR}/src/serialization/include -I${PROJECT_BINARY_DIR}/genfiles) - -set(SWIG_OUTFILE_DIR ${CMAKE_CURRENT_BINARY_DIR}) -set(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal) -swig_add_library(mgb LANGUAGE python SOURCES ${SWIG_SRC} ${SRCS}) - -set(VERSION_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/src/version.ld) -add_custom_target(version_ld SOURCES ${VERSION_SCRIPT}) - -set_target_properties(mgb PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal) -if (APPLE) - target_link_libraries(mgb megbrain megdnn) - set_target_properties(mgb PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") -elseif (MSVC OR WIN32) - target_link_libraries(mgb megbrain megdnn) -else() - target_link_libraries(mgb megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT}) -endif() -target_include_directories(mgb PRIVATE ${PYTHON_INCLUDE_DIRS} src/cpp ${CMAKE_CURRENT_BINARY_DIR} ${NUMPY_INCLUDE_DIR}) -# only windows need link PYTHON_LIBRARIES -if(MSVC OR WIN32) - target_link_libraries(mgb ${PYTHON_LIBRARIES}) -endif() - -if (MGE_WITH_DISTRIBUTED) - target_link_libraries(mgb megray) -endif() - -add_dependencies(mgb mgb_opr_py version_ld) - -add_custom_command( - TARGET mgb POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/LICENSE ${PROJECT_SOURCE_DIR}/ACKNOWLEDGMENTS ${PROJECT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/megengine ${CMAKE_CURRENT_BINARY_DIR}/megengine - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/test ${CMAKE_CURRENT_BINARY_DIR}/test - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/setup.py ${CMAKE_CURRENT_BINARY_DIR}/setup.py - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/requires.txt ${CMAKE_CURRENT_BINARY_DIR}/requires.txt - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/requires-style.txt - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/requires-test.txt - COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/include - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/megbrain_pubapi.h ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/include/megbrain_pubapi.h - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/src/core/include ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/include - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/src/opr/include ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/include - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/src/serialization/include ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/include - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/src/plugin/include ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/include - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/dnn/include ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/include - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/genfiles/megbrain_build_config.h ${CMAKE_CURRENT_BINARY_DIR}/megengine/_internal/include/megbrain_build_config.h -) - diff --git a/python_module/megengine/__init__.py b/python_module/megengine/__init__.py deleted file mode 100644 index 81c59cd0..00000000 --- a/python_module/megengine/__init__.py +++ /dev/null @@ -1,11 +0,0 
@@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .core import * -from .logger import enable_debug_log, get_logger, set_log_file, set_log_level -from .version import __version__ diff --git a/python_module/megengine/_internal/__init__.py b/python_module/megengine/_internal/__init__.py deleted file mode 100644 index bac0e791..00000000 --- a/python_module/megengine/_internal/__init__.py +++ /dev/null @@ -1,729 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -"""the megbrain python package - -Note that all the submodules are automatically imported, so you usually only -need to ``import megengine._internal as mgb``. -""" - -import collections -import json -import os -import sys -import platform -import ctypes - -if sys.platform == "win32": - lib_path = os.path.join(os.path.dirname(__file__), "lib") - Lib_path = os.path.join(os.path.dirname(__file__), "Lib") - dll_paths = list(filter(os.path.exists, [lib_path, Lib_path])) - assert len(dll_paths) > 0 - - kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) - has_load_library_attr = hasattr(kernel32, "AddDllDirectory") - old_error_mode = kernel32.SetErrorMode(0x0001) - - kernel32.LoadLibraryW.restype = ctypes.c_void_p - if has_load_library_attr: - kernel32.AddDllDirectory.restype = ctypes.c_void_p - kernel32.LoadLibraryExW.restype = ctypes.c_void_p - - for dll_path in dll_paths: - if sys.version_info >= (3, 8): - os.add_dll_directory(dll_path) - elif has_load_library_attr: - res = kernel32.AddDllDirectory(dll_path) - if res is None: - err = ctypes.WinError(ctypes.get_last_error()) - err.strerror += ' Error adding "{}" to the DLL search PATH.'.format( - dll_path - ) - raise err - else: - print("WARN: python or OS env have some issue, may load DLL failed!!!") - - import glob - - dlls = glob.glob(os.path.join(lib_path, "*.dll")) - path_patched = False - for dll in dlls: - is_loaded = False - if has_load_library_attr: - res = kernel32.LoadLibraryExW(dll, None, 0x00001100) - last_error = ctypes.get_last_error() - if res is None and last_error != 126: - err = ctypes.WinError(last_error) - err.strerror += ' Error loading "{}" or one of its dependencies.'.format( - dll - ) - raise err - elif res is not None: - is_loaded = True - if not is_loaded: - if not path_patched: - os.environ["PATH"] = ";".join(dll_paths + [os.environ["PATH"]]) - path_patched = True - res = kernel32.LoadLibraryW(dll) - if res is None: - err = ctypes.WinError(ctypes.get_last_error()) - err.strerror += ' Error loading "{}" or one of its dependencies.'.format( - dll - ) - raise err - - kernel32.SetErrorMode(old_error_mode) - -import numpy as np - -from . import comp_graph_tools as cgtools -from . import config, craniotome, dtype -from . import global_init as _global_init -from . import helper as _helper -from . import mgb as _detail -from . 
import opr, opr_extra, opr_param_defs, plugin -from .exc import MegBrainError -from .logconf import get_logger -from .mgb import ( - CompGraph, - CompNode, - SharedND, - SharedScalar, - SymbolVar, - TensorValueDumperContext, - TensorValueLoaderContext, -) -from .mgb import as_comp_node as comp_node -from .mgb_helper import SharedNDLazyInitializer, callback_lazycopy, copy_output -from .plugin import CompGraphProfiler -from .plugin import GlobalInfkernFinder as _GlobalInfkernFinder -from .plugin import NumRangeChecker -from .version import __version__, version_info - -if sys.version_info.major < 3: - raise ImportError("megbrain requires python 3") - - -class ProxySharedNDAndSymbolVar(_detail.SymbolVar): - """this is a :class:`.SymbolVar` with a corresponding :class:`.SharedND`. - It can participate in graph computating and also provides :meth:`set_value` - and :meth:`get_value`. It should be constructed by :func:`make_shared`. - """ - - __shared_nd = None - __kwargs = None - - def __init__(self, snd, comp_graph, name, **kwargs): - self.__shared_nd = snd - self.__kwargs = kwargs - self.this = snd.symvar(comp_graph=comp_graph, name=name, **kwargs).this - - def set_value(self, v, **kwargs): - ret = self.__shared_nd.set_value(v, **kwargs) - self._reeval_if_eager_eval() - return ret - - def get_value(self): - return self.__shared_nd.get_value() - - def reset_zero(self): - self.__shared_nd.reset_zero() - - -def make_shared( - comp_node, - *, - dtype=None, - shape=None, - value=None, - comp_graph=None, - name=None, - volatile=None -): - """make a shared tensor which is stored on device and could be modified - later, either as a :class:`.SymbolVar` or a :class:`.SharedND` object - - :param comp_node: computing node - :type comp_node: :class:`.CompNode` - :param dtype: data type; if it is None, then dtype of value would be used - if value is not None, and float32 would be used as default dtype if - value is None - :type dtype: :class:`numpy.dtype` compatible - :param value: initializing value - :type value: None or :class:`numpy.ndarray` - :param comp_graph: the computing graph to which this shared value should - belong; if provided, the retuned object could be used as a - :class:`.SymbolVar` - :type comp_graph: None or :class:`.CompGraph` - :param name: node name to be used in computing graph; only meaningful if - *comp_graph* is provided - :param volatile: if *comp_graph* is given then *volatile* indicates whether - shape or mem ptr of this SharedND can be changed - :rtype: :class:`.SharedND` if *comp_graph* is not given; or - :class:`ProxySharedNDAndSymbolVar` otherwise - """ - if dtype is None: - if value is not None: - value = np.ascontiguousarray(value) - dtype = to_mgb_supported_dtype(value.dtype) - else: - dtype = np.float32 - comp_node = _detail.as_comp_node(comp_node) - rst = _detail.SharedND(comp_node, dtype) - if value is not None: - assert shape is None, "could not provide both value and shape" - rst.set_value(value) - elif shape is not None: - rst._set_init_shape(shape) - if comp_graph is None: - assert name is None and volatile is None - return rst - assert isinstance(comp_graph, CompGraph), "expect CompGraph but got {}".format( - comp_graph - ) - if volatile is None: - volatile = False - else: - assert isinstance(volatile, bool) - return ProxySharedNDAndSymbolVar(rst, comp_graph, name, volatile=volatile) - - -def make_immutable(comp_node, comp_graph, value, *, dtype=None, name=None): - """make a graph node containing an immutable tensor from host tensor value - - :param dtype: 
required data type; if not None, the data would be converted - to that type; otherwise - """ - - comp_node = _detail.as_comp_node(comp_node) - assert isinstance( - comp_graph, _detail.CompGraph - ), "expect CompGraph but got {!r}".format(comp_graph) - - config = _detail.make_opr_config(name, comp_node) - return _helper.cvt_opr_result( - _detail._make_immutable(comp_graph, value, dtype, config) - ) - - -def make_arg( - comp_node, - comp_graph, - *, - dtype=np.float32, - shape=None, - name=None, - value=None, - enable_static_infer=True -): - """make an argument to be passed to compiled function during runtime; - - :type shape: None or tuple of int - :param shape: expected tensor shape to be used for shape inferring; actual - tesor shape could be different - :type name: str - :param name: name of the generated var node - :type value: None or ndarray-compatible - :param value: initial value used for static inference; if not given, static - infer would be deferred to first graph execution - :param enable_static_infer: whether to enable static inference for this var - """ - comp_node = _detail.as_comp_node(comp_node) - host_val = mgb._HostSharedND(comp_node, dtype) - - if value is not None: - value = np.ascontiguousarray(value, dtype=dtype) - if shape is None: - shape = value.shape - else: - assert shape == value.shape - if shape is not None: - host_val._resize(shape) - - if value is not None: - host_val.set_value(value) - - return _helper.cvt_opr_result( - ProxySharedNDAndSymbolVar( - host_val, comp_graph, name, enable_static_infer=enable_static_infer - ) - ) - - -def comp_graph(*, extra_opts=None, check_env_var=True): - """allocate a new computing graph - - :param extra_opts: extra options to be set; would be updated (modified - inplace) from ``MGB_COMP_GRAPH_OPT`` environment var. See - :func:`.set_comp_graph_option` for list of supported options. - :type extra_opts: dict - :param check_env_var: whether to check environment vars - :type check_env_var: bool - - :return: the comp graph object - :rtype: :class:`.CompGraph` - """ - cg = _detail.CompGraph() - if extra_opts is None: - extra_opts = {} - if check_env_var: - setting = os.getenv("MGB_COMP_GRAPH_OPT") - if setting: - for item in setting.split(";"): - k, v = item.split("=", 1) - extra_opts.setdefault(k, v) - get_logger().warning( - "set comp graph option from env: {}".format(extra_opts) - ) - user_data = os.getenv("MGB_COMP_GRAPH_USER_DATA") - if user_data: - storage = cg.user_data - for ud in user_data.split(";"): - k, v = ud.split("=", 1) - storage[k] = eval(v) - _GlobalInfkernFinder.add_graph(cg) - for k, v in extra_opts.items(): - cg.set_option(k, v) - return cg - - -def grad( - target, wrt, warn_mid_wrt=True, use_virtual_grad=None, return_zero_for_nodep=True -): - r"""compute symbolic grad - - :param target: grad target var - :type target: :class:`.SymbolVar` - :param wrt: with respect to which to compute the grad - :type wrt: :class:`.SymbolVar` or Iterable[SymbolVar] - :param warn_mid_wrt: whether to give warning if *wrt* is not endpoint - :type warn_mid_wrt: bool - :param use_virtual_grad: whether to use virtual grad opr, so fwd graph can - be optimized before applying grad; if ``None`` is given, then virtual - grad would be used if ``graph_opt_level >= 2`` - :type use_virtual_grad: :class:`bool` or ``None`` - :param return_zero_for_nodep: if *target* does not depend on *wrt*, set to True to return - a zero-valued `.SymbolVar` rather than ``None``; can't be set to False when using - virtual grad opr. 
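Since the whole legacy wrapper is deleted by this diff, a short usage sketch of the `grad` helper documented above may help reviewers; `loss`, `x`, and `w` are hypothetical `SymbolVar`s, and `megengine._internal` no longer exists after this change:

```python
# hedged sketch of the removed symbolic-grad API; all names are assumptions
import megengine._internal as mgb  # removed by this diff

gx = mgb.grad(loss, x)           # single wrt var -> single SymbolVar
gx, gw = mgb.grad(loss, [x, w])  # iterable wrt -> list of grads, same order

# with return_zero_for_nodep=False, a var the target does not depend on
# yields None rather than a zero-valued SymbolVar
maybe_gw = mgb.grad(loss, w, return_zero_for_nodep=False)
```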
- :type return_zero_for_nodep: bool - :rtype: :class:`.SymbolVar` or None - :return: :math:`\frac{\partial\text{target}}{\partial\text{wrt}}` - """ - if use_virtual_grad is None: - use_virtual_grad = -1 - else: - use_virtual_grad = 1 if use_virtual_grad else 0 - - if isinstance(wrt, SymbolVar): - wrts = [ - wrt, - ] - else: - wrts = wrt - - assert isinstance(wrts, collections.Iterable) - # return a invalid SymbolVar (with nullptr VarNode*) when return_zero_for_nodep is False - # and target doesn't depend on wrt - grads = _detail._grad( - target, wrts, bool(warn_mid_wrt), use_virtual_grad, return_zero_for_nodep - ) - grads = list(grads) - - for i in range(len(grads)): - if not grads[i].valid: - assert ( - not return_zero_for_nodep - ), "invalid grad SymbolVar: target={}, wrt={}".format(target, wrts[i]) - grads[i] = None - - if len(grads) == 1: - grads = grads[0] - - return grads - - -def current_grad_target(comp_graph): - """get current target var to compute grad, used for implementing custom - gradient""" - return _detail._current_grad_target(comp_graph) - - -def add_device_map(map_location): - """add map location while loading models""" - _detail.CompNode.cn_thread_local.__setattr__("map_location", map_location) - - -def del_device_map(): - """delete map location""" - _detail.CompNode.cn_thread_local.__delattr__("map_location") - - -def inter_graph_trans_var(dest_graph, src): - """get the corresponding var of *src* in *dest_graph*; assuming - *dest_graph* is a copy of owner graph of *src*; usually used in callback of - set_grad to get grad of vars in loop - - :param dest_graph: target computing graph - :type dest_graph: :class:`.CompGraph` - :param src: source var node - :type src: :class:`.SymbolVar` - :return: corresponding var in *dest_graph* - :rtype: :class:`.SymbolVar` - """ - return _detail._inter_graph_trans_var(dest_graph, src) - - -def get_graph_optimizer_replaced_var(src): - """get optimized var corresponding to given var; usually used in callback - of set_grad to get grad w.r.t. some var - - :param src: source var node - :type src: :class:`.SymbolVar` - :rtype: :class:`.SymbolVar` - """ - return _detail._get_graph_optimizer_replaced_var(src) - - -CompGraphSerializationResult = collections.namedtuple( - "CompGraphSerializationResult", - [ - "nr_opr", - "tot_bytes", - "tensor_value_bytes", - "content_hash", - "inputs", - "outputs", - "params", - ], -) - - -def serialize_comp_graph_to_file( - fpath, - output_vars, - *, - keep_var_name=1, - keep_param_name=False, - keep_opr_priority=False, - tensor_value_dumper=None, - output_strip_info=False, - append=False, - format=None, - **kwargs -): - """serialize this computing graph and write result to a file. Note: - ``kwargs`` exists for backward compatibility; there is no additional - arguments. - - :parma fpath: path for the output file - :type fpath: ``str`` - :param output_vars: output variables that need to be retrieved when - deserializing - - .. note:: - - The underlying C++ API only accepts a var list. If a dict is given, - the vars would be renamed to given names. 
- - :type output_vars: dict(name => :class:`.SymbolVar`), or a list of vars - :param keep_var_name: level for keeping variable names: - - * 0: none of the names are kept - * 1: keep names of output vars - * 2: keep names of all (output and internal) vars - :param keep_param_name: whether to keep param names, so param values can be - easily manipulated after loading model - :param keep_opr_priority: whether to keep priority setting for operators - :param tensor_value_dumper: a callable to dump tensor values; it should - only write the tensor value without layout information. It would be - given a :class:`.TensorValueDumperContext` object as its sole argument. - :param output_strip_info: if set to True, then a json file containing - information for code strip would be written to ``fpath+'.json'`` - :param append: whether to open output file in append mode - :return: an instance of namedtuple :class:`CompGraphSerializationResult`, - whose fields are: - - * ``nr_opr`` number of operators dumped - * ``tot_bytes`` total bytes for the whole graph - * ``tensor_value_bytes`` bytes consumed for dumping tensor values - * ``inputs`` names of input tensors - * ``params`` list of names of dumped params - * ``outputs`` names of output vars - :param format: serialization format of the resulting model, should be either - "mdl" or "fbs"; none means default. - :type format: ``str`` - """ - - assert isinstance(fpath, str), "bad file path: {!r}".format(fpath) - ov = _detail._VectorSymbolVar() - SUPPORTED_FORMATS = { - # default - None: _detail.GraphDumpFormat_FLATBUFFERS, - "fbs": _detail.GraphDumpFormat_FLATBUFFERS, - } - resolved_fmt = SUPPORTED_FORMATS.get(format, None) - if resolved_fmt is None: - raise ValueError( - "unknown format {} requested, supported ones are {}".format( - format, list(filter(None, SUPPORTED_FORMATS.keys())) - ) - ) - if isinstance(output_vars, dict): - used_vars = set() - for name, var in output_vars.items(): - assert isinstance(var, _detail.SymbolVar), "bad output var: {!r}".format( - var - ) - assert var.id not in used_vars, ( - "var name is associated with a var object, so we can not have " - "two names given to the same var: {}".format(var) - ) - used_vars.add(var.id) - var.rename(name) - ov.push_back(var) - else: - for i in output_vars: - assert isinstance(i, _detail.SymbolVar), "bad output var: {!r}".format(i) - ov.push_back(i) - - if tensor_value_dumper is not None: - assert isinstance(tensor_value_dumper, collections.Callable) - - class Callback(_detail._TensorValueDumperCallback): - def call(self, ctx, *, _f=tensor_value_dumper): - _f(ctx) - - tensor_value_dumper = Callback() - - # for backward compatibility - mangle_opr_name = kwargs.pop("mangle_opr_name", ov) - if mangle_opr_name is not ov: - get_logger().warning("mangle_opr_name is deprecated; use keep_var_name instead") - keep_var_name = 1 if mangle_opr_name else 2 - mangle_param_name = kwargs.pop("mangle_param_name", ov) - assert ( - not kwargs - ), "extra kwargs provided to serialize_comp_graph_to_file: {}".format(kwargs) - - if mangle_param_name is not ov: - get_logger().warning( - "mangle_param_name is deprecated; use keep_param_name instead" - ) - keep_param_name = not mangle_param_name - - inputs = _detail._VectorString() - outputs = _detail._VectorString() - params = _detail._VectorString() - stat = _detail._VectorSizeT() - - _detail._serialize_comp_graph_to_file( - fpath, - append, - resolved_fmt, - ov, - keep_var_name, - keep_param_name, - keep_opr_priority, - tensor_value_dumper, - stat, - inputs, - outputs, - 
params, - ) - - dump_ret = CompGraphSerializationResult( - *stat, list(inputs), list(outputs), list(params) - ) - - if output_strip_info: - with open(fpath + ".json", "w") as fout: - strip_info = _detail._get_info_for_strip(ov) - strip_info_dict = json.loads(strip_info) - strip_info_dict["hash"] = dump_ret.content_hash - json.dump(strip_info_dict, fout) - - return dump_ret - - -CompGraphLoadResult = collections.namedtuple( - "CompGraphLoadResult", ["graph", "output_vars_dict", "output_vars_list"] -) - - -def load_comp_graph_from_file( - fpath, *, comp_node_mapper=None, tensor_value_loader=None -): - """Load a serialized computing graph from file. - - :parma fpath: Path for the output file - :type fpath: ``str`` - :param comp_node_mapper: A callable to modify comp node locator, takes old - locator as argument and returns new locator. - :type comp_node_mapper: Callable[[str], str] - :param tensor_value_loader: A callable to load tensor values. It should - read the tensor value with the given shape and dtype and return it as - NumPy ndarray. It would be given a :class:`.TensorValueLoaderContext` - object as its sole argument. - :type tensor_value_loader: Callable[[TensorValueLoaderContext], numpy.ndarray] - :return: An instance of namedtuple :class:`CompGraphLoadResult`, - whose fields are: - - * ``graph`` loaded CompGraph - * ``output_vars_dict`` A Python dict, mapping name to output SymbolVar - * ``output_vars_list`` A Python list, containing output vars in the - order passed to serialize_comp_graph_to_file - """ - assert isinstance(fpath, str), "bad file path: {!r}".format(fpath) - - if comp_node_mapper is not None: - assert isinstance(comp_node_mapper, collections.Callable) - - class Callback(_detail._CompNodeMapperCallback): - def call(self, desc, *, _f=comp_node_mapper): - return _f(desc) - - comp_node_mapper = Callback() - if tensor_value_loader is not None: - assert isinstance(tensor_value_loader, collections.Callable) - - class Callback(_detail._TensorValueLoaderCallback): - def call(self, ctx, *, _f=tensor_value_loader): - return _f(ctx) - - tensor_value_loader = Callback() - output_vars_map = _detail._VectorPairStringSymbolVar() - output_vars_list = _detail._VectorSymbolVar() - cg = _detail._load_comp_graph_from_file( - fpath, comp_node_mapper, tensor_value_loader, output_vars_map, output_vars_list - ) - return CompGraphLoadResult(cg, dict(list(output_vars_map)), list(output_vars_list)) - - -def optimize_for_inference( - output_vars, - *, - f16_io_f32_comp=False, - f16_io_comp=False, - use_nhwcd4=False, - fuse_conv_bias_nonlinearity=False, - use_nchw32=False, - fuse_conv_bias_with_z=False, - use_nchw4=False, - use_nchw88=False, - use_nchw44=False, - use_nchw44_dot=False, - use_chwn4=False -): - """optimize computing graph for inference - - This applies a predefined set of optimization passes. Refer to the mnist - sdk example and C++ code for fine-grained control. - - :param output_vars: output symvars - :type output_vars: list of :class:`.SymbolVar` - :param f16_io_f32_comp: whether to use float16 for I/O between oprs and use - float32 as internal computation precision. Note the output var would be - changed to float16 - :param f16_io_comp: whether to use float16 for both I/O and computation - precision - :param use_nhwcd4: whether to use NHWCD4 data format. This is faster on some - OpenCL devices - :param fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty - into one opr. This is supported only in NHWCD4 format. 
- :param use_nchw4: whether to use NCHW4 tensor format. - :param use_nchw88: whether to use NCHW88 tensor format. This maybe faster some - times. - :param use_nchw44: whether to use NCHW44 tensor format. This maybe faster some - times. - :param use_nchw44_dot: whether to use NCHW44_DOT tensor format. This format is - optimized for inference in armv8.2 - :param use_nchw32: whether to use NCHW32 tensor format. Mainly used for - nvidia tensorcore. - :param use_chwn4: whether to use CHWN4 tensor format. Mainly used for - nvidia tensorcore. - - - :return: list of transformed vars corresponding to given output vars - """ - - assert isinstance(output_vars, (list, tuple)) - opt = _detail._OptimizeForInferenceOptions() - settings = locals() - for i in [ - "f16_io_f32_comp", - "f16_io_comp", - "fuse_conv_bias_nonlinearity", - "fuse_conv_bias_with_z", - ]: - if settings[i]: - getattr(opt, "enable_{}".format(i))() - - layout_tranform = None - for k, v in { - "use_nchw4": "nchw4", - "use_nhwcd4": "nhwcd4", - "use_nchw32": "nchw32", - "use_nchw88": "nchw88", - "use_nchw44": "nchw44", - "use_nchw44_dot": "nchw44_dot", - "use_chwn4": "chwn4", - }.items(): - if settings[k]: - assert ( - not layout_tranform - ), "Only one layout transform supported, both {} and {}".format( - layout_tranform, k - ) - getattr(opt, "enable_{}".format(v))() - layout_tranform = k - - vec = _detail._VectorSymbolVar() - for i in output_vars: - assert isinstance(i, _detail.SymbolVar), "bad var: {}".format(i) - vec.push_back(i) - return list(_detail._optimize_for_inference(vec, opt)) - - -def get_opr_fp_graph_exec(comp_graph, output_vars): - """get opr footprint and graph exec info - - This function will recompile the compute graph, the AsyncExecutable compiled - before will be invalid. - - :param comp_graph: ComputingGraph - :param output_vars: list of :class:'.SymbolVar' - """ - assert isinstance(output_vars, (list, tuple)) - vec = _detail._VectorSymbolVar() - for i in output_vars: - assert isinstance(i, _detail.SymbolVar), "bad var: {}".format(i) - vec.push_back(i) - return json.loads(_detail._get_opr_fp_graph_exec(comp_graph, output_vars)) - - -def to_mgb_supported_dtype(dtype_): - """get the dtype supported by megbrain nearest to given dtype""" - if ( - dtype.is_lowbit(dtype_) - or dtype.is_quantize(dtype_) - or dtype.is_bfloat16(dtype_) - ): - return dtype_ - return _detail._to_mgb_supported_dtype(dtype_) - - -def return_free_memory(): - """return free memory chunks on all devices. - - This function will try it best to free all consecutive free chunks back to - operating system, small pieces may not be returned. - - Please notice that this function will not move any memory in-use. - """ - _detail.CompNode._try_coalesce_all_free_memory() diff --git a/python_module/megengine/_internal/_timed_func_fork_exec_entry.py b/python_module/megengine/_internal/_timed_func_fork_exec_entry.py deleted file mode 100644 index 50492ec1..00000000 --- a/python_module/megengine/_internal/_timed_func_fork_exec_entry.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
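One note on the `optimize_for_inference` helper removed above: the `use_*` layout flags are mutually exclusive, while the fp16 flags compose with them. A hedged sketch, where `output_vars` is a hypothetical list of `SymbolVar`s:

```python
opt_vars = mgb.optimize_for_inference(
    output_vars,
    f16_io_f32_comp=True,  # fp16 I/O, fp32 internal compute
    use_nhwcd4=True,       # at most one use_* layout transform is allowed
)
```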
- -import argparse -import os -import sys - -import megengine._internal.mgb as _mgb - -try: - from setproctitle import setproctitle -except ImportError: - setproctitle = None - - -def main(): - parser = argparse.ArgumentParser( - description="entry point for fork-exec callback in TimedFuncInvoker;" - " this file should not be used directly by normal user." - ) - parser.add_argument("user_data") - args = parser.parse_args() - - if setproctitle: - setproctitle("megbrain:timed_func_exec:ppid={}".format(os.getppid())) - _mgb._timed_func_exec_cb(args.user_data) - raise SystemError("_timed_func_exec_cb returned") - - -if __name__ == "__main__": - main() diff --git a/python_module/megengine/_internal/comp_graph_tools.py b/python_module/megengine/_internal/comp_graph_tools.py deleted file mode 100644 index 5777d7d0..00000000 --- a/python_module/megengine/_internal/comp_graph_tools.py +++ /dev/null @@ -1,274 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -"""tools for graph manipulation""" - -import collections - -from . import mgb as _mgb - - -def get_dep_vars(var, var_type=None): - """return :class:`.SymbolVar` of type ``var_type`` that input ``var`` - depands on. If ``var_type`` is None, return all types. - - :type var: an instance or iterable of :class:`.SymbolVar` - :type var_type: ``str`` or an iterable of ``str`` - "rtype: list of :class:`.SymbolVar` - """ - outputs = [] - memo = set() - - if isinstance(var, _mgb.SymbolVar): - var = [var] - - if isinstance(var_type, str): - var_type = [var_type] - - q = list(var) - while q: - v = q.pop() - if v in memo: - continue - memo.add(v) - q.extend(get_inputs(v)) - if var_type is not None: - if get_type(v) in var_type: - outputs.append(v) - else: - outputs.append(v) - - return outputs - - -def get_inputs(var): - """get the inputs of owner opr of a variable - - :type var: :class:`.SymbolVar` - :rtype: list of :class:`.SymbolVar` - """ - assert isinstance(var, _mgb.SymbolVar) - return _mgb._get_owner_opr_inputs(var) - - -def get_type(var): - """get the type of owner opr of a variable - - :type var: :class:`.SymbolVar` - :rtype: ``str`` - """ - assert isinstance(var, _mgb.SymbolVar) - return _mgb._get_owner_opr_type(var) - - -def get_opr_type(opr): - """get the type of a opr - - :type var: :class:`.Operator` - :rtype: ``str`` - """ - assert isinstance(opr, _mgb.Operator) - return _mgb._get_opr_type(opr) - - -def graph_traversal(outputs): - """helper function to traverse the computing graph and reeturn enough useful information - - :param outputs: model outputs - :type outputs: :class:`.Symbolvar` - :return: tuple (map_oprs, map_vars, var2oprs, opr2receivers, indegree2opr, opr2indegree) - WHERE - map_oprs is dict from opr_id to actual opr - map_vars is dict from var_id to actual var - var2oprs is dict from var to dest oprs along with index - opr2receivers is dict from current opr to next opr - indegree2opr is dict from in_degree to opr in computing graph - opr2indegree is dict from opr in computing graph to in_degree - - (indegree2opr, opr2indegree) are only used in topological sort in get_oprs_seq function - """ - # meta information for comp graph - map_oprs = collections.defaultdict(set) - map_vars = 
collections.defaultdict(set) - - var2oprs = collections.defaultdict(list) - opr2receivers = collections.defaultdict(list) - - queue = list(map(lambda x: x.owner_opr, outputs)) - visited = set(map(lambda x: x.id, queue)) - - # iterate through whole comp_graph, fill in meta information - indegree2opr = collections.defaultdict(set) - opr2indegree = {} - - idx = 0 - while idx < len(queue): - cur_opr = queue[idx] - map_oprs[cur_opr.id] = cur_opr - - idx += 1 - - indegree = 0 - for var_idx, var in enumerate(cur_opr.inputs): - map_vars[var.id] = var - var2oprs[var.id].append((cur_opr.id, var_idx)) - - pre_opr = var.owner_opr - - if pre_opr.id not in visited: - visited.add(pre_opr.id) - queue.append(pre_opr) - - indegree += 1 - opr2receivers[pre_opr.id].append(cur_opr.id) - - indegree2opr[indegree].add(cur_opr.id) - opr2indegree[cur_opr.id] = indegree - - return map_oprs, map_vars, var2oprs, opr2receivers, indegree2opr, opr2indegree - - -def get_oprs_seq(outputs, prune_reshape=False): - """get oprs in some topological order for a dumped model - - :param outputs: model outputs - :param prune_reshape: whether to prune the operators useless during inference - :return: opr list with some correct execution order - """ - - def topological_sort(map_oprs, opr2receivers, indegree2opr, opr2indegree): - # generate an execution order with topological sort algorithm - oprs_seq = [] - nr_remain = len(map_oprs) - while indegree2opr[0]: - opr_id = indegree2opr[0].pop() - opr = map_oprs[opr_id] - nr_remain -= 1 - - # skip const value generation operator - if get_opr_type(opr) != "ImmutableTensor": - oprs_seq.append(opr) - - for post_id in opr2receivers[opr_id]: - indegree = opr2indegree[post_id] - indegree2opr[indegree].remove(post_id) - - indegree -= 1 - indegree2opr[indegree].add(post_id) - opr2indegree[post_id] = indegree - - assert nr_remain == 0, "there are {} remaining nodes; cyclic graph?".format( - nr_remain - ) - return oprs_seq - - # reshape op definition: reshape(input_tensor, dest_shape) -> output_tensor - # when inferencing, shape of output_tensor is already known, so one can prune some operators related to dest_shape in the loaded graph - def prune_reshape_oprs(outputs, oprs_seq, var2oprs): - def iterative_pruning(cur_opr, post_opr, marked_opr_ids): - useless = True - for oup in cur_opr.outputs: - if "workspace" not in oup.name: - var_idx = post_opr.inputs.index(oup) - var2oprs[oup.id].remove((post_opr.id, var_idx)) - useless = useless and (len(var2oprs[oup.id]) == 0) - - if useless: - marked_opr_ids.append(cur_opr.id) - - for inp in cur_opr.inputs: - iterative_pruning(inp.owner_opr, cur_opr, marked_opr_ids) - - reshape_vars = get_dep_vars(outputs, "Reshape") - reshape_oprs = [var.owner_opr for var in reshape_vars] - - marked_opr_ids = [] - for reshape_opr in reshape_oprs: - iterative_pruning( - reshape_opr.inputs[1].owner_opr, reshape_opr, marked_opr_ids - ) - - # filter out all marked oprs - return list(filter(lambda x: x.id not in marked_opr_ids, oprs_seq)) - - map_oprs, _, var2oprs, opr2receivers, indegree2opr, opr2indegree = graph_traversal( - outputs - ) - oprs_seq = topological_sort(map_oprs, opr2receivers, indegree2opr, opr2indegree) - if prune_reshape is True: - oprs_seq = prune_reshape_oprs(outputs, oprs_seq, var2oprs.copy()) - return oprs_seq - - -def replace_vars(dst, varmap): - """replace vars in the graph - - :param dst: target vars representing the graph - :type dst: list of :class:`.SymbolVar` - :param varmap: the map that specifies how to replace the vars - :type varmap: dict that 
maps from src var to dst var - - :return: new vars that correspond to ``dst`` with all the dependencies - replaced - :rtype: list of :class:`.SymbolVar` - """ - dst_vec = _mgb._VectorSymbolVar() - repl_src_vec = _mgb._VectorSymbolVar() - repl_dst_vec = _mgb._VectorSymbolVar() - for i in dst: - assert isinstance(i, _mgb.SymbolVar) - dst_vec.push_back(i) - - for i, j in getattr(varmap, "items", lambda: varmap)(): - assert isinstance(i, _mgb.SymbolVar) - assert isinstance(j, _mgb.SymbolVar) - repl_src_vec.push_back(i) - repl_dst_vec.push_back(j) - - return _mgb._replace_vars(repl_src_vec, repl_dst_vec, dst_vec) - - -def replace_oprs(dst, oprmap): - """Replace operators in the graph. Roughly equivalent to - - :param dst: target vars representing the graph - :type dst: list of :class:`.SymbolVar` - :param oprmap: the map that specifies how to replace the operators - :type oprmap: dict that maps from src operator to dst operator - - :return: new vars that correspond to ``dst`` with all the dependencies - replaced - :rtype: list of :class:`.SymbolVar` - """ - dst_vec = _mgb._VectorSymbolVar() - repl_src_vec = _mgb._VectorOperator() - repl_dst_vec = _mgb._VectorOperator() - for i in dst: - assert isinstance(i, _mgb.SymbolVar) - dst_vec.push_back(i) - - for i, j in getattr(oprmap, "items", lambda: oprmap)(): - assert isinstance(i, _mgb.Operator) - assert isinstance(j, _mgb.Operator) - repl_src_vec.push_back(i) - repl_dst_vec.push_back(j) - - return _mgb._replace_oprs(repl_src_vec, repl_dst_vec, dst_vec) - - -def set_priority_to_id(dest_vars): - """For all oprs in the subgraph constructed by dest_vars - set its priority to id if its original priority is zero - :param dest_vars: target vars representing the graph - """ - dest_vec = _mgb._VectorSymbolVar() - for i in dest_vars: - assert isinstance(i, _mgb.SymbolVar) - dest_vec.push_back(i) - _mgb._set_priority_to_id(dest_vec) diff --git a/python_module/megengine/_internal/config.py b/python_module/megengine/_internal/config.py deleted file mode 100644 index 3bac8ab5..00000000 --- a/python_module/megengine/_internal/config.py +++ /dev/null @@ -1,439 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -import collections -import os - -from . import mgb as _mgb - -_default_device_type = "CUDA" - - -def set_device_map(logical_dev, physical_dev, device_type=None): - """map from *logical_dev* to *physical_dev* for furture comp node - loading - - example:: - - set_device_map(0, 2, 'CPU') # cpu0 -> cpu2 - set_device_map('gpu3', 'gpu0') # gpu0 -> gpu0 - - :param device_type: specify the device type if devices are given by - integers; if devices are given by integers and ``device_type`` is not - given, the default value ``'CUDA'`` would be used. Possible values are - ``'CUDA'`` and ``'CPU'``. 
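A hedged sketch of the device-remapping helpers documented above (device names are illustrative):

```python
from megengine._internal import config  # removed by this diff

config.set_device_map(0, 2, "CPU")     # cpu0 -> cpu2
config.set_device_map("gpu3", "gpu0")  # gpu3 -> gpu0
config.set_default_device("gpu1")      # later "xpux" nodes resolve to gpu1
```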
- """ - - if device_type is None: - device_type = _default_device_type - - if device_type == "CUDA": - xpu = "gpu" - else: - assert device_type == "CPU" - xpu = "cpu" - - def rmxpu(v): - if isinstance(v, str): - assert v.startswith(xpu) or v.startswith("xpu"), ( - "bad comp node in set_device_map: " - "device_type={} comp_node={}".format(device_type, v) - ) - return v[3:] - return v - - logical_dev, physical_dev = map(rmxpu, [logical_dev, physical_dev]) - _mgb.CompNode._set_device_map(device_type, int(logical_dev), int(physical_dev)) - - -def set_default_device(physical_dev, device_type=None): - """set physcal device for xpux - - when *device_type* is None and *physical_dev* starts with *gpu* or *cpu*, - the default device type would be modified accordingly for future calls to - :func:`set_device_map` when remapping device number. - """ - global _default_device_type - if ( - device_type is None - and isinstance(physical_dev, str) - and not physical_dev.isdigit() - and not physical_dev.startswith("xpu") - ): - t = physical_dev[:3] - if t == "gpu": - _default_device_type = "CUDA" - else: - assert t == "cpu", "bad physical_dev: {}".format(physical_dev) - _default_device_type = "CPU" - set_default_device_type(_default_device_type) - device_type = _default_device_type - set_device_map(-1, physical_dev, device_type) - - -def set_default_device_type(device_type): - """set device type for xpu""" - global _default_device_type - device_type = device_type.upper() - _mgb.CompNode._set_unspec_device_type(device_type) - _default_device_type = device_type - - -def set_fork_cuda_warning_flag(flag): - """set warning to be printed at fork if cuda has been initialized - - :type flag: int - :param flag: controls how the warning should be printed: - - * 0: disable warning - * 1: print warning to log - * 2: print warning to log and raise exception - """ - _mgb._config.set_fork_cuda_warning_flag(int(flag)) - - -def get_device_count(device_type="xpu", warn=True): - """get number of devices installed on this system - - :param device_type: device type, one of 'xpu', 'gpu' or 'cpu' - :type device_type: str - """ - return _mgb.CompNode._get_device_count(device_type.upper(), warn) - - -def parse_locator(device_name: str) -> tuple: - """get the tensor locator expression by device name. - - :param device_name: device name, like 'cpu0', 'gpu1' and 'xpux' - :type device_name: str - - :return: (device_type, dev_num, stream_num) - """ - return _mgb.CompNode._parse_locator(device_name) - - -def set_mem_reserve_size(size): - """set memory reserve size: - - * If *size* is greater than 1, it is the absolute amount of memory to - be reserved in MB; - * If *size* is in the range (0, 1), it is the ratio of total memory; - * If *size* is 0, memory reservation and pre-allocation would be - disabled; - * If *size* is -1, disable custom memory allocator and use cuda APIs - directly. - """ - _mgb._config.set_mem_reserve_size(float(size)) - - -def set_comp_graph_option(comp_graph, name, val): - """set computing graph option and return its old value - :type comp_graph: :class:`.CompGraph` - :param comp_graph: the computing graph whose option should be modified - :type name: str - :param name: option name - Currently supported options are: - - * "no_profiling_on_shape_change": bool; - When execution strategy is set to profiling, always use the - initial profile result and do not re-run profiling even if input - shape changes. 
- * "seq_opt.enable_mem_plan_opt": bool - * "seq_opt.enable_mem_reuse_alloc": bool - * "seq_opt.enable_seq_comp_node_opt": bool - * "force_dynamic_alloc": bool - * "var_sanity_check_first_run": bool - * "enable_sublinear_memory_opt": bool - * "enable_memory_swap": bool; whether to enable memory swap; it - usually performs worse than sublinear memory - * "enable_var_mem_defragment": bool - * "allocate_static_mem_after_graph_compile": bool - * "enable_grad_var_static_reshape": bool: - If set to ``True``, dynamically-shaped gradients whose original - shape is statically inferrable would be reshaped, so static - shape inference can continue - * "async_exec_level": int - - * ``0``: do not dispatch asynchronously - * ``1``: async dispatch if there are more than 1 cuda comp - nodes - * mask ``0b10``: async for comp nodes with unlimited queue - (e.g. CPU comp nodes) - * mask ``0b100``: async for even one comp node - * "log_level": int - - * ``0``: no log info for graph construction/compiling - * ``1``: static memory allocation status, - WorkspaceLimitGetter summary, and optimizer summary - * ``2``: optimizer details and duplicated operators tha are - removed - * "graph_opt.jit": whether to enable JIT - * "graph_opt.tensorrt": whether to enable fine-grained automatic - replacement for TensorRT operators - * "graph_opt.android_nn": whether to enable fine-grained automatic - replacement for Android NN operators - * "graph_opt_level": int - - * ``0``: disable - * ``1``: level-1: inplace arith transformations during graph - construction - * ``2``: (default) level-2: level-1, plus global optimization - before graph compiling - * ``3``: also enable JIT - :param val: new option value - :return: old option value - """ - if name == "log_static_mem_alloc": - name = "log_level" - if name == "enable_async_exec": - name = "async_exec_level" - return _mgb._config.set_comp_graph_option(comp_graph, name, int(val)) - - -def comp_graph_is_eager(comp_graph): - return _mgb._config.comp_graph_is_eager(comp_graph) - - -def add_extra_vardep(var, dep): - """add *dep* as an extra dependency of *var*, so if *var* is required to - compute the final output when compiling a comp graph, *dep* would also be - included in the computing sequence. Note that the order computing of these - two vars is not guaranteed. 
- """ - assert isinstance(var, _mgb.SymbolVar) and isinstance(dep, _mgb.SymbolVar) - assert var.owner_graph == dep.owner_graph - return _mgb._config.add_extra_vardep(var, dep) - - -class _GraphPropertyBase: - """helper class for implementing operator property setter context managers""" - - _cur_graph = None - - _graph2stack = None - """class attribute that maintains mapping from graph to property stack; - should be defined by child classes""" - - __prop_setup__ = None - """overwritten by subclass to setup property""" - - __prop_clear__ = None - """overwritten by subclass to clear property""" - - def __init__(self, comp_graph, prop): - """:param comp_graph: computing graph, or None to not set this - property""" - if comp_graph is not None: - assert isinstance( - comp_graph, _mgb.CompGraph - ), "invalid comp graph: {!r}".format(comp_graph) - self._cur_graph = comp_graph - self._graph2stack.setdefault(comp_graph, []).append(prop) - - def __setup(self, prop): - self.__prop_setup__(self._cur_graph, prop) - - def __clear(self): - self.__prop_clear__(self._cur_graph) - - def __enter__(self): - if self._cur_graph is None: - return - - stack = self._graph2stack[self._cur_graph] - if len(stack) > 1: - # clear nested property - self.__clear() - self.__setup(stack[-1]) - - def __exit__(self, exc_type, exc_value, exc_traceback): - if self._cur_graph is None: - return - - stack = self._graph2stack[self._cur_graph] - self.__clear() - stack.pop() - if stack: - # restore nested property - self.__setup(stack[-1]) - else: - del self._graph2stack[self._cur_graph] - - -class exc_opr_tracker_scope(_GraphPropertyBase): - """context manager for associating an object with all operators created - within this context; so when an exception is raised, information about the - corresponding operator could be retrieved from - :attr:`.MegBrainError.tracker` - - :param comp_graph: the computing graph where the operators should be tracked - :type comp_graph: :class:`.CompGraph` - :param tracker: an arbitrary python object to track the operators - """ - - _graph2stack = {} - - def __init__(self, comp_graph, tracker): - assert ( - tracker is not None - ), "bad args for exc_opr_tracker_scope: {!r} {!r}".format(comp_graph, tracker) - super().__init__(comp_graph, tracker) - - __prop_setup__ = staticmethod(_mgb._config.begin_set_exc_opr_tracker) - __prop_clear__ = staticmethod(_mgb._config.end_set_exc_opr_tracker) - - -class opr_priority_scope(_GraphPropertyBase): - """context manager for setting priority for all operators created in this - context - - :param comp_graph: the computing graph for which operator priority should - be set - :type comp_graph: :class:`.CompGraph` - :param priority: operator priority. Smaller number means higher priority. - Default value is 0. Grad operator would use negative priority by - default. - """ - - _graph2stack = {} - - LOWEST_PRIORITY = 2 ** 31 - 1 - """lowest prority (i.e. max possible value)""" - - HIGHEST_PRIORITY = -LOWEST_PRIORITY - """highest prority (i.e. 
min possible value)""" - - def __init__(self, comp_graph, priority): - super().__init__(comp_graph, int(priority)) - - __prop_setup__ = staticmethod(_mgb._config.begin_set_opr_priority) - __prop_clear__ = staticmethod(_mgb._config.end_set_opr_priority) - - -OprTrackerResult = collections.namedtuple( - "OprTrackerResult", ["msg", "tracker", "grad_tracker"] -) - - -def get_opr_tracker(cg, var_id): - """get the tracking object associated with the owner operator of a var - - :param cg: the computing graph - :param var_id: id of the var whose owner opr tracker should be found - - :return: if no var is found, ``None`` is returned; otherwise return an - :class:`OprTrackerResult` object - """ - assert isinstance(cg, _mgb.CompGraph) - ret = _mgb._config.get_opr_tracker(cg, int(var_id)) - if ret is None: - return - return OprTrackerResult(*ret) - - -def set_opr_sublinear_memory_endpoint(var): - """set the owner operator of a symvar to be endpoint of sublinear memory - optimizer - - - :type var: :class:`.SymbolVar` - """ - _mgb._config.set_opr_sublinear_memory_endpoint(var) - - -def max_size_t(): - """get max value of size_t type on local architecture""" - return _mgb.max_size_t() - - -def is_cuda_ctx_set(): - """return whether current thread has an active cuda driver context""" - return _mgb._config.is_cuda_ctx_set() - - -def get_include_path(): - """get include path for building megbrain extensions""" - return os.path.join(os.path.realpath(os.path.dirname(__file__)), "include") - - -def get_cuda_gencode(only_cap=False): - """get -gencode options to be passed to nvcc for compiling on local - machine - - :param only_cap: if True, return only a list of cuda compute capability - strings (like ``['35', '52']`` ) - """ - ret = _mgb._config.get_cuda_gencode().split() - if not only_cap: - ret = " ".join(map("-gencode arch=compute_{0},code=sm_{0}".format, ret)) - return ret - - -def get_cuda_lib_path(): - """get the cuda lib64 path by locating nvcc - """ - return _mgb._config.get_cuda_lib_path() - - -def get_cuda_include_path(): - """get the cuda include path by locating nvcc, including - parent path and `parent path`/include - """ - return _mgb._config.get_cuda_include_path() - - -def get_cuda_version(): - """get runtime cuda version - """ - return _mgb._config.get_cuda_version() - - -def is_local_cuda_env_ok(): - """check whether local cuda environment ok by locating nvcc - """ - return _mgb._config.is_local_cuda_env_ok() - - -def is_compiled_with_cuda(): - """whether cuda is enabled at compile time""" - return _mgb._config.is_compiled_with_cuda() - - -def load_opr_library(path): - """Load an external operator library. This essentially sets megbrain - symbols as public and load the library. - - :param path: path to the shared object; if it is None, then only megbrain - symbols are made public. 
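A hedged sketch of the CUDA build-helper queries and `load_opr_library` documented above (the library path is hypothetical):

```python
if config.is_compiled_with_cuda() and config.is_local_cuda_env_ok():
    flags = config.get_cuda_gencode()              # "-gencode arch=...,code=sm_..."
    caps = config.get_cuda_gencode(only_cap=True)  # e.g. ['35', '52']
    inc_dirs = config.get_cuda_include_path()

config.load_opr_library("/path/to/libcustom_opr.so")  # load extra operators
config.load_opr_library(None)  # only make megbrain symbols public
```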
- """ - _mgb._config.load_opr_library( - os.path.realpath(os.path.join(os.path.dirname(__file__), "_mgb.so")), path - ) - - -def dump_registered_oprs(): - """ - get all registered oprs, return dict(id, name) - """ - return dict(_mgb._config.dump_registered_oprs()) - - -def create_mm_server(server_addr, port): - """ - create mm server with server address - throw exception if server_addr is already used - """ - return _mgb._config.create_mm_server(server_addr, port) - - -def group_barrier(server_addr, port, size, rank): - """ - block until all ranks reach this barrier - """ - return _mgb._config.group_barrier(server_addr, port, size, rank) diff --git a/python_module/megengine/_internal/craniotome.py b/python_module/megengine/_internal/craniotome.py deleted file mode 100644 index 3a1e6d50..00000000 --- a/python_module/megengine/_internal/craniotome.py +++ /dev/null @@ -1,432 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -"""used for creating a megbrain operator from python""" - -import copy -import itertools -from abc import ABCMeta, abstractmethod, abstractproperty - -from . import helper as _helper -from . import mgb as _mgb - - -class _CraniotomeBaseMeta(ABCMeta): - _base_created = False - - def __init__(cls, name, bases, member_dict): - if _CraniotomeBaseMeta._base_created: - assert "__init__" not in member_dict, ( - "Craniotome operators should not overwrite __init__ method; " - "use setup() instead." - ) - forbidden = set( - k for k in dir(CraniotomeBase) if k[0] == "_" and k[1] != "_" - ) - forbidden.add("get_io_vars") - check_key = member_dict.get("__check_key__", True) - whitelist = ["__classcell__"] - for k in member_dict.keys(): - assert k not in forbidden, "{} could not be overwritten".format(k) - if ( - check_key - and k.startswith("__") - and k.endswith("__") - and k not in whitelist - and not hasattr(CraniotomeBase, k) - ): - raise KeyError( - "name {} in class {} does not exist in the baseclass".format( - k, name - ) - ) - else: - _CraniotomeBaseMeta._base_created = True - super().__init__(name, bases, member_dict) - - -class CraniotomeBase(_mgb.CraniotomeDesc, metaclass=_CraniotomeBaseMeta): - """base class used for extending megbrain core operators in python - - Note: all names starting and ending with two underscores in the subclasses - would be checked and KeyError would be raised if the name does not exist in - the base class. This behavor can be disabled by setting ``__check_key__`` - to ``False`` (see the testcase for more details) - """ - - # methods and attributes to be overwritten by subclasses - - __expand_single_outputs__ = True - """if :attr:`__nr_outputs__` is 1, whether to return a single - :class:`.SymbolVar` instead of a tuple in :meth:`make`""" - - __is_dynamic_output_shape__ = False - """whether output shape could not be inferred from input shape. If value of - this attribute is ``False``, :meth:`infer_shape` must be implemented. If - this attribute is ``True`` but the operator has no inputs, then - :meth:`infer_shape` would also be called to infer output shape before - operator execution. - """ - - __disable_sys_mem_alloc__ = False - """whether to disable system memory allocator. 
This is used when - :attr:`__is_dynamic_output_shape__` is ``False`` but the output memory - should not be managed by megbrain system (so it can be forwarded from - external buffer)""" - - __allow_duplicate__ = True - """whether this operator can be duplicated (e.g. used in sublinear - memory)""" - - __allow_empty_out__ = False - """whether empty output shape is allowed; if it is set as ``False``, then - an exception would be raised if output var is empty to prevent erroneously - forgetting initializing output vars""" - - @abstractproperty - def __nr_inputs__(self): - """number of input vars""" - - @abstractproperty - def __nr_outputs__(self): - """number of output vars""" - - @abstractmethod - def execute(self, inputs, outputs): - """execute the operator, read values from *inputs* by calling - :meth:`.CompGraphCallbackValueProxy.get_value` and write results into - *outputs* by calling :meth:`.SharedND.set_value` - - :param inputs: values for each input var - :type inputs: tuple of :class:`.CompGraphCallbackValueProxy` - :param outputs: values for each output var - :type outputs: tuple of :class:`.SharedND` - """ - - def setup(self): - """overwritten by subclass to accept kwargs passed to :meth:`make` to - setup the operator""" - - def infer_shape(self, inp_shapes): - """infer output shape from input shapes - - :type inp_shapes: tuple of tuple of ints - :param inp_shapes: input shapes for each input var - :rtype: tuple of tuple of ints - :return: output shapes for each output var - """ - raise NotImplementedError( - "{}: infer_shape() not implemented; for operators with dynamic " - "output shape, __is_dynamic_output_shape__ should be set to True".format( - self - ) - ) - - def grad(self, wrt_idx, inputs, outputs, out_grad): - """compute symbolic gradient; should be overwritten by differentiable - subclasses - - :type wrt_idx: int - :param wrt_idx: the input var with respect to which the gradient should - be computed; please also see the notes below - :type inputs: tuple of :class:`.SymbolVar` - :param inputs: input symbol vars - :type outputs: tuple of :class:`.SymbolVar` - :param outputs: output symbol vars - :type out_grad: tuple of (:class:`.SymbolVar` or None) - :param out_grad: gradients of loss with respect to each output var - - .. note:: - - In case when loss does not depend on some var (i.e. zero grad), - the corresponding value in *out_grad* would be ``None``. It is - guaranteed that at least one element in *out_grad* is not - ``None``. - - .. note:: - - This function can return either of the following: - - 1. Gradient of the input specified by ``wrt_idx`` - 2. A list containing gradients of all inputs. In this case, - ``wrt_idx`` can be ignored. - - And the so called gradient can be either one of: - - 1. A :class:`.SymbolVar` representing the symbolic gradient - value - 2. ``0`` representing zero gradient - """ - raise NotImplementedError("grad for {} not implemented".format(self)) - - def init_output_dtype(self, input_dtypes): - """infer output dtypes from input dtypes; return None to use default - infer function in megbrain. - - .. note:: - This method must be implemented if there is no input var - - :param input_dtypes: input dtypes - :type input_dtypes: list of :class:`numpy.dtype` - :rtype: None or list of :class:`numpy.dtype`-compatible - """ - - def get_serialize_params(self): - """get params for megbrain graph serialization. 
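
As a concrete reading of the gradient contract above, a differentiable
subclass could implement (illustrative sketch; assumes an operator computing
out = 2 * inp, and that SymbolVar supports arithmetic operators):

    def grad(self, wrt_idx, inputs, outputs, out_grad):
        # d(loss)/d(inp) = 2 * d(loss)/d(out); out_grad[0] is None only
        # when the loss does not depend on this output
        if out_grad[0] is None:
            return 0  # "0" is the documented way to report a zero gradient
        return out_grad[0] * 2
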
This function should - return a list or tuple, containing one or two elements: the first - element must be a string, representing the name passed to - ``opr_loader_maker`` during deserializing; the second element, if - exists, must be convertible to ``bytes`` and is used for dumping any - extra opr params, which can be retrieved by ``load_buf_with_len`` - during deserializing. - """ - raise NotImplementedError( - "get_serialize_params() for {} not implemented".format(self) - ) - - def copy(self): - """copy this craniotome descriptor; the default implementation creates - a new object, and copies object ``__dict__``""" - ret = type(self)() - d0 = self.__dict__.copy() - d0.pop("this") - ret.__dict__.update(copy.deepcopy(d0)) - return ret - - def on_graph_compiled(self, used_outputs): - """a callback that would be invoked when the graph is compiled; it - would always have a matching :meth:`on_compiled_func_deleted` call - - :param used_outputs: indices of outputs that are needed for the - computation - :type used_outputs: ``tuple of int`` - """ - - def on_compiled_func_deleted(self): - """a callback that would be invoked when the compiled function is - destructed; it would always have a matching :meth:`on_graph_compiled` - call""" - - def get_io_vars(self): - """get input vars, comp order dep vars and output vars - - :return: a dict with keys ``'input'``, ``'output'`` and - ``'comp_order'`` that maps to corresponding list of vars - """ - all_vars = list(self._get_all_io_vars()) - nr_inp = self.__nr_inputs__ - nr_out = self.__nr_outputs__ - nr_comp_order = self._get_nr_dev_comp_order_deps() - s0 = nr_inp + nr_comp_order - return dict( - input=all_vars[:nr_inp], - comp_order=all_vars[nr_inp:s0], - output=all_vars[s0:], - ) - - @property - def owner_opr_id(self): - """ID of the operator that owns this descriptor""" - return self._get_opr_id() - - @property - def comp_node(self): - """comp node on which this operator runs""" - return self._get_comp_node() - - # below are methods that should not be changed - - def _hash(self): - return int(hash(self)) % (1 << 64) - - def _setup_self(self, dst): - dst.append(self) - - def _is_same(self, rhs): - return bool(self == rhs) - - def _node_flag(self): - return ( - (int(bool(self.__is_dynamic_output_shape__)) << 0) - | (int(not self.__allow_duplicate__) << 1) - | (int(bool(self.__allow_empty_out__)) << 2) - | (int(bool(self.__disable_sys_mem_alloc__)) << 3) - ) - - def _get_opr_type_name(self): - return str(self.__class__.__name__) - - def _get_nr_outputs(self): - return int(self.__nr_outputs__) - - def _execute(self, inputs, outputs): - inputs = tuple(inputs) - outputs = tuple(outputs) - if not self.__is_dynamic_output_shape__: - out_shapes = [i.shape for i in outputs] - self.execute(inputs, outputs) - if not self.__is_dynamic_output_shape__: - new_shapes = [i.shape for i in outputs] - assert ( - out_shapes == new_shapes - ), "output shape changed after executing {}: before={} after={}".format( - self, out_shapes, new_shapes - ) - - def _infer_shape(self, inp_shapes): - inp_shapes = tuple(tuple(map(int, i)) for i in inp_shapes) - oshp_get = self.infer_shape(inp_shapes) - assert ( - len(oshp_get) == self.__nr_outputs__ - ), "{}: expect {} outputs; got {}(val: {}) from infer_shape".format( - self, self.__nr_outputs__, len(oshp_get), oshp_get - ) - return _helper.cvt_to_vector_of_shape(oshp_get) - - def _grad(self, wrt_idx, inputs, outputs, out_grad): - og = [] - for i in out_grad: - if i.valid: - og.append(i) - else: - og.append(None) - rst = 
self.grad(int(wrt_idx), tuple(inputs), tuple(outputs), tuple(og))
-        if not isinstance(rst, (list, tuple)):
-            rst = [rst]
-        else:
-            assert len(rst) == len(
-                inputs
-            ), "{}: opr has {} inputs but {} grads are returned".format(
-                self, len(inputs), len(rst)
-            )
-
-        for i in range(len(rst)):
-            cur = rst[i]
-            if isinstance(cur, int) and cur == 0:
-                rst[i] = _mgb.SymbolVar()
-            else:
-                assert isinstance(cur, _mgb.SymbolVar), (
-                    "{}: invalid grad result; it should be either "
-                    "0 or a SymbolVar, got {!r} instead".format(self, cur)
-                )
-        return rst
-
-    def _get_nr_dev_comp_order_deps(self):
-        return 0
-
-    def _init_output_dtype(self, input_dtypes, ret):
-        get = self.init_output_dtype(input_dtypes)
-        if get is not None:
-            assert isinstance(ret, (list, tuple)) and len(get) == len(ret)
-            ret[:] = get
-            return True
-        assert self.__nr_inputs__, (
-            "{}: init_output_dtype must be implemented "
-            "if there is no input var".format(self)
-        )
-        return False
-
-    def _setup_serialize_params(self, output):
-        val = list(self.get_serialize_params())
-        assert len(val) in [1, 2]
-        name = val[0]
-        assert isinstance(name, str)
-        output.append(name)
-        if len(val) == 2:
-            output.append(bytes(val[1]))
-
-    def _copy(self):
-        ret = self.copy()
-        assert type(ret) is type(
-            self
-        ), "copy() returned different type: src={} copied={}".format(
-            type(self), type(ret)
-        )
-        assert ret is not self
-        ret.__disown__()
-        self._set_copy_result(ret)
-
-    def _on_graph_compile_or_func_del(self, used_outputs):
-        if used_outputs:
-            self.on_graph_compiled(used_outputs)
-        else:
-            self.on_compiled_func_deleted()
-
-    def __repr__(self):
-        return "craniotome:{}".format(self.__class__.__name__)
-
-    @classmethod
-    def make(
-        cls,
-        *inputs,
-        comp_graph=None,
-        name=None,
-        comp_node=None,
-        config=None,
-        dev_comp_order_deps=[],
-        **kwargs
-    ):
-        """apply this operator on some input vars and return corresponding
-        output vars
-
-        :type inputs: tuple of :class:`.SymbolVar`
-        :param inputs: input symvars; immediate values could also be accepted,
-            as long as there is symvar to infer comp node and comp graph
-        :param comp_graph: if there is no input vars, *comp_graph* must be
-            provided to specify which computing graph to insert this operator
-        :param dev_comp_order_deps: vars that must have been computed
-            before executing this operator
-        :param kwargs: extra keyword arguments to be passed to :meth:`setup` of
-            this class
-        :param name: name of the resulting operator
-        :rtype: tuple of :class:`.SymbolVar`
-        :return: output symvars
-        """
-
-        if not inputs and not dev_comp_order_deps:
-            assert isinstance(
-                comp_graph, _mgb.CompGraph
-            ), "{}: comp_graph must be given if no inputs provided".format(cls)
-
-        desc = cls()
-        desc.setup(**kwargs)
-        assert (
-            len(inputs) == desc.__nr_inputs__
-        ), "{}: expected {} inputs, got {}".format(
-            desc, desc.__nr_inputs__, len(inputs)
-        )
-
-        config = _helper.gen_config(name, comp_node, config)
-
-        # get inp_vec
-        inp_vec = _mgb._VectorSymbolVar()
-        for i in _helper.canonize_input_vars(
-            itertools.chain(inputs, dev_comp_order_deps),
-            comp_graph=comp_graph,
-            config=config,
-        ):
-            inp_vec.push_back(i)
-        desc._get_nr_dev_comp_order_deps = lambda *, val=len(dev_comp_order_deps): val
-
-        if comp_graph is not None:
-            desc._get_comp_graph = lambda: comp_graph
-        expand_single_outputs = desc.__expand_single_outputs__
-        desc.__disown__()
-        rst = _mgb.make_opr_from_craniotome_desc(desc, inp_vec, config)
-        if expand_single_outputs and len(rst) == 1:
-            return rst[0]
-        return tuple(rst)
-
-
-def make_opr(cls):
-    """decorator used to wrap a :class:`.CraniotomeBase` subclass and return
-    its :meth:`~.CraniotomeBase.make` method
-    """
-    assert issubclass(cls, CraniotomeBase)
-    return cls.make
diff --git a/python_module/megengine/_internal/dtype.py b/python_module/megengine/_internal/dtype.py
deleted file mode 100644
index 6bb32f86..00000000
--- a/python_module/megengine/_internal/dtype.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-import collections
-from typing import Union
-
-import numpy as np
-
-from .mgb import bfloat16, intb1, intb2, intb4
-
-_QuantDtypeMetadata = collections.namedtuple(
-    "QuantDtypeMetadata", ["name", "np_dtype_str", "is_unsigned", "qmin", "qmax",]
-)
-
-_metadata_dict = {
-    "quint8": _QuantDtypeMetadata("Quantized8Asymm", "uint8", True, 0, 255),
-    "qint8": _QuantDtypeMetadata("QuantizedS8", "int8", False, -128, 127),
-    "quint4": _QuantDtypeMetadata("Quantized4Asymm", "uint8", True, 0, 15),
-    "qint4": _QuantDtypeMetadata("QuantizedS4", "int8", False, -8, 7),
-    "qint32": _QuantDtypeMetadata(
-        "QuantizedS32", "int32", False, -(2 ** 31), 2 ** 31 - 1,
-    ),
-    # NOTE: int2 is not supported for model dump yet
-    "quint2": _QuantDtypeMetadata(None, "uint8", True, 0, 3),
-    "qint2": _QuantDtypeMetadata(None, "int8", False, -2, 1),
-}
-
-
-def is_quantize(dtype):
-    return (
-        hasattr(dtype, "metadata")
-        and dtype.metadata is not None
-        and "mgb_dtype" in dtype.metadata
-    )
-
-
-def is_lowbit(dtype):
-    return (dtype is intb1) or (dtype is intb2) or (dtype is intb4)
-
-
-def is_bfloat16(dtype):
-    return dtype is bfloat16
-
-
-def get_scale(dtype):
-    assert is_quantize(dtype)
-    return dtype.metadata["mgb_dtype"]["scale"]
-
-
-def get_zero_point(dtype):
-    assert is_quantize(dtype)
-    metadata = dtype.metadata["mgb_dtype"]
-    assert metadata["name"] in ("Quantized8Asymm", "Quantized4Asymm")
-    return metadata["zero_point"]
-
-
-def _check_zero_point(zp: int, dtype_str: str):
-    qmin = _metadata_dict[dtype_str].qmin
-    qmax = _metadata_dict[dtype_str].qmax
-    if zp < qmin or zp > qmax:
-        raise ValueError(
-            "zero_point should be within [{}, {}] for {}".format(qmin, qmax, dtype_str)
-        )
-
-
-def get_quantized_dtype(dtype_str: str, scale: float, zp: Union[int, None]):
-    r"""
-    Get quantized dtype with metadata attribute according to _metadata_dict.
-
-    Note that unsigned dtype must have ``zero_point`` and signed dtype must
-    not have ``zero_point``, to be consistent with tensor generated by calling
-    compiled function from `CompGraph.compile(inputs, outspec)`.
-
-    :param dtype: a string indicating which dtype to return
-    :param scale: a number for scale to store in dtype's metadata
-    :param zp: a number for zero_point to store in dtype's metadata
-    """
-    metadata = _metadata_dict[dtype_str]
-    np_dtype_str = metadata.np_dtype_str
-    is_unsigned = metadata.is_unsigned
-    if is_unsigned:
-        if zp is None or int(zp) != zp:
-            raise ValueError("zero_point should be an integer")
-        zp = int(zp)
-        _check_zero_point(zp, dtype_str)
-        return np.dtype(
-            np_dtype_str,
-            metadata={
-                "mgb_dtype": {
-                    "name": metadata.name,
-                    "scale": float(scale),
-                    "zero_point": zp,
-                }
-            },
-        )
-    else:
-        return np.dtype(
-            np_dtype_str,
-            metadata={"mgb_dtype": {"name": metadata.name, "scale": float(scale)}},
-        )
-
-
-def quint8(scale, zero_point):
-    """
-    Construct a quantized unsigned int8 data type with ``scale`` (float) and
-    ``zero_point`` (uint8). The real value represented by a quint8 data type is
-    float_val = scale * (uint8_val - zero_point)
-    """
-    return get_quantized_dtype("quint8", scale, zero_point)
-
-
-def qint8(scale):
-    """
-    Construct a quantized int8 data type with ``scale`` (float). The real value
-    represented by a qint8 data type is float_val = scale * int8_val
-    """
-    return get_quantized_dtype("qint8", scale, None)
-
-
-def qint32(scale):
-    """
-    Construct a quantized int32 data type with ``scale`` (float). The real value
-    represented by a qint32 data type is float_val = scale * int32_val
-    """
-    return get_quantized_dtype("qint32", scale, None)
-
-
-def quint4(scale, zero_point):
-    """
-    Construct a quantized unsigned int4 data type with ``scale`` (float) and
-    ``zero_point`` (uint8). The real value represented by a quint4 data type is
-    float_val = scale * (uint4_val - zero_point)
-    """
-    return get_quantized_dtype("quint4", scale, zero_point)
-
-
-def qint4(scale):
-    """
-    Construct a quantized int4 data type with ``scale`` (float). The real value
-    represented by a qint4 data type is float_val = scale * int4_val
-    """
-    return get_quantized_dtype("qint4", scale, None)
-
-
-def _convert_to_quantized_dtype(arr: np.ndarray, dtype: np.dtype, dtype_str: str):
-    metadata = _metadata_dict[dtype_str]
-    arr_metadata = dtype.metadata["mgb_dtype"]
-    if not isinstance(arr, np.ndarray):
-        raise ValueError("arr parameter should be instance of np.ndarray")
-    if not is_quantize(dtype) or arr_metadata["name"] != metadata.name:
-        raise ValueError("dtype parameter should be a {} dtype".format(dtype_str))
-    is_unsigned = metadata.is_unsigned
-    if is_unsigned:
-        scale, zp = (
-            arr_metadata["scale"],
-            arr_metadata["zero_point"],
-        )
-        return (
-            (np.round(arr / scale) + zp)
-            .clip(metadata.qmin, metadata.qmax)
-            .astype(dtype)
-        )
-    else:
-        # don't trick to combine with is_unsigned, seeing ``get_quantized_dtype``
-        scale = arr_metadata["scale"]
-        return np.round(arr / scale).clip(metadata.qmin, metadata.qmax).astype(dtype)
-
-
-def _convert_from_quantized_dtype(arr: np.ndarray, dtype_str: str):
-    metadata = _metadata_dict[dtype_str]
-    arr_metadata = arr.dtype.metadata["mgb_dtype"]
-    if not isinstance(arr, np.ndarray):
-        raise ValueError("arr parameter should be instance of np.ndarray")
-    if not is_quantize(arr.dtype) or arr_metadata["name"] != metadata.name:
-        raise ValueError("arr's dtype should be a {} dtype".format(dtype_str))
-    is_unsigned = metadata.is_unsigned
-    if is_unsigned:
-        scale, zp = (
-            arr_metadata["scale"],
-            arr_metadata["zero_point"],
-        )
-        return (arr.astype(np.float32) - zp) * scale
-    else:
-        # don't trick to combine with is_unsigned, seeing ``get_quantized_dtype``
-        scale = arr_metadata["scale"]
-        return (arr.astype(np.float32)) * scale
-
-
-def convert_to_quint8(arr: np.ndarray, q: np.dtype):
-    """
-    Quantize a float NumPy ndarray into a quint8 one with specified params.
-
-    :param arr: Input ndarray.
-    :param q: Target data type, should be a quint8.
-    """
-    return _convert_to_quantized_dtype(arr, q, "quint8")
-
-
-def convert_from_quint8(arr: np.ndarray):
-    """
-    Dequantize a quint8 NumPy ndarray into a float one.
-
-    :param arr: Input ndarray.
-    """
-    return _convert_from_quantized_dtype(arr, "quint8")
-
-
-def convert_to_qint8(arr: np.ndarray, q: np.dtype):
-    """
-    Quantize a float NumPy ndarray into a qint8 one with specified params.
-
-    :param arr: Input ndarray.
-    :param q: Target data type, should be a qint8.
-    """
-    return _convert_to_quantized_dtype(arr, q, "qint8")
-
-
-def convert_from_qint8(arr: np.ndarray):
-    """
-    Dequantize a qint8 NumPy ndarray into a float one.
-
-    :param arr: Input ndarray.
-    """
-    return _convert_from_quantized_dtype(arr, "qint8")
-
-
-def convert_to_qint32(arr: np.ndarray, q: np.dtype):
-    """
-    Quantize a float NumPy ndarray into a qint32 one with specified params.
-
-    :param arr: Input ndarray.
-    :param q: Target data type, should be a qint32.
-    """
-    return _convert_to_quantized_dtype(arr, q, "qint32")
-
-
-def convert_from_qint32(arr):
-    """
-    Dequantize a qint32 NumPy ndarray into a float one.
-
-    :param arr: Input ndarray.
-    """
-    return _convert_from_quantized_dtype(arr, "qint32")
-
-
-def convert_to_quint4(arr: np.ndarray, q: np.dtype):
-    """
-    Quantize a float NumPy ndarray into a quint4 one with specified params.
-
-    :param arr: Input ndarray.
-    :param q: Target data type, should be a quint4.
-    """
-    return _convert_to_quantized_dtype(arr, q, "quint4")
-
-
-def convert_from_quint4(arr: np.ndarray):
-    """
-    Dequantize a quint4 NumPy ndarray into a float one.
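
A short round-trip sketch of the quantize/dequantize helpers above
(illustrative; scale and zero_point chosen arbitrarily):

    import numpy as np

    dt = quint8(scale=0.1, zero_point=128)
    arr = np.array([-1.0, 0.0, 1.0], dtype=np.float32)
    q = convert_to_quint8(arr, dt)   # raw uint8 values [118, 128, 138]
    back = convert_from_quint8(q)    # approximately [-1.0, 0.0, 1.0]
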
- - :param arr: Input ndarray. - """ - return _convert_from_quantized_dtype(arr, "quint4") - - -def convert_to_qint4(arr: np.ndarray, q: np.dtype): - """ - Quantize a float NumPy ndarray into a qint4 one with specified params. - - :param arr: Input ndarray. - :param q: Target data type, should be a qint4. - """ - return _convert_to_quantized_dtype(arr, q, "qint4") - - -def convert_from_qint4(arr: np.ndarray): - """ - Dequantize a qint4 NumPy ndarray into a float one. - - :param arr: Input ndarray. - """ - return _convert_from_quantized_dtype(arr, "qint4") diff --git a/python_module/megengine/_internal/enum36.py b/python_module/megengine/_internal/enum36.py deleted file mode 100644 index 929eecf7..00000000 --- a/python_module/megengine/_internal/enum36.py +++ /dev/null @@ -1,947 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright [2001] [Cython] -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# --------------------------------------------------------------------- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# -# This file has been modified by Megvii ("Megvii Modifications"). -# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. -# ---------------------------------------------------------------------- - -import sys -from functools import reduce -from operator import or_ as _or_ -from types import DynamicClassAttribute, MappingProxyType - -# try _collections first to reduce startup cost -try: - from _collections import OrderedDict -except ImportError: - from collections import OrderedDict - - -__all__ = [ - "EnumMeta", - "Enum", - "IntEnum", - "Flag", - "IntFlag", - "auto", - "unique", -] - - -def _is_descriptor(obj): - """Returns True if obj is a descriptor, False otherwise.""" - return ( - hasattr(obj, "__get__") or hasattr(obj, "__set__") or hasattr(obj, "__delete__") - ) - - -def _is_dunder(name): - """Returns True if a __dunder__ name, False otherwise.""" - return ( - name[:2] == name[-2:] == "__" - and name[2:3] != "_" - and name[-3:-2] != "_" - and len(name) > 4 - ) - - -def _is_sunder(name): - """Returns True if a _sunder_ name, False otherwise.""" - return ( - name[0] == name[-1] == "_" - and name[1:2] != "_" - and name[-2:-1] != "_" - and len(name) > 2 - ) - - -def _make_class_unpicklable(cls): - """Make the given class un-picklable.""" - - def _break_on_call_reduce(self, proto): - raise TypeError("%r cannot be pickled" % self) - - cls.__reduce_ex__ = _break_on_call_reduce - cls.__module__ = "" - - -_auto_null = object() - - -class auto: - """ - Instances are replaced with an appropriate value in Enum class suites. - """ - - value = _auto_null - - -class _EnumDict(dict): - """Track enum member order and ensure member names are not reused. 
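
The name-reuse check described above means a duplicated member name fails at
class-definition time (illustrative):

    class Color(Enum):
        RED = 1
        RED = 2  # raises TypeError: Attempted to reuse key: 'RED'
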
- - EnumMeta will use the names found in self._member_names as the - enumeration member names. - - """ - - def __init__(self): - super().__init__() - self._member_names = [] - self._last_values = [] - - def __setitem__(self, key, value): - """Changes anything not dundered or not a descriptor. - - If an enum member name is used twice, an error is raised; duplicate - values are not checked for. - - Single underscore (sunder) names are reserved. - - """ - if _is_sunder(key): - if key not in ( - "_order_", - "_create_pseudo_member_", - "_generate_next_value_", - "_missing_", - ): - raise ValueError("_names_ are reserved for future Enum use") - if key == "_generate_next_value_": - setattr(self, "_generate_next_value", value) - elif _is_dunder(key): - if key == "__order__": - key = "_order_" - elif key in self._member_names: - # descriptor overwriting an enum? - raise TypeError("Attempted to reuse key: %r" % key) - elif not _is_descriptor(value): - if key in self: - # enum overwriting a descriptor? - raise TypeError("%r already defined as: %r" % (key, self[key])) - if isinstance(value, auto): - if value.value == _auto_null: - value.value = self._generate_next_value( - key, 1, len(self._member_names), self._last_values[:] - ) - value = value.value - self._member_names.append(key) - self._last_values.append(value) - super().__setitem__(key, value) - - -# Dummy value for Enum as EnumMeta explicitly checks for it, but of course -# until EnumMeta finishes running the first time the Enum class doesn't exist. -# This is also why there are checks in EnumMeta like `if Enum is not None` -Enum = None - - -class EnumMeta(type): - """Metaclass for Enum""" - - @classmethod - def __prepare__(metacls, cls, bases): - # create the namespace dict - enum_dict = _EnumDict() - # inherit previous flags and _generate_next_value_ function - member_type, first_enum = metacls._get_mixins_(bases) - if first_enum is not None: - enum_dict["_generate_next_value_"] = getattr( - first_enum, "_generate_next_value_", None - ) - return enum_dict - - def __new__(metacls, cls, bases, classdict): - # an Enum class is final once enumeration items have been defined; it - # cannot be mixed with other types (int, float, etc.) if it has an - # inherited __new__ unless a new __new__ is defined (or the resulting - # class will fail). - member_type, first_enum = metacls._get_mixins_(bases) - __new__, save_new, use_args = metacls._find_new_( - classdict, member_type, first_enum - ) - - # save enum items into separate mapping so they don't get baked into - # the new class - enum_members = {k: classdict[k] for k in classdict._member_names} - for name in classdict._member_names: - del classdict[name] - - # adjust the sunders - _order_ = classdict.pop("_order_", None) - - # check for illegal enum names (any others?) - invalid_names = set(enum_members) & { - "mro", - } - if invalid_names: - raise ValueError( - "Invalid enum member name: {0}".format(",".join(invalid_names)) - ) - - # create a default docstring if one has not been provided - if "__doc__" not in classdict: - classdict["__doc__"] = "An enumeration." 
- - # create our new Enum type - enum_class = super().__new__(metacls, cls, bases, classdict) - enum_class._member_names_ = [] # names in definition order - enum_class._member_map_ = OrderedDict() # name->value map - enum_class._member_type_ = member_type - - # save attributes from super classes so we know if we can take - # the shortcut of storing members in the class dict - base_attributes = {a for b in enum_class.mro() for a in b.__dict__} - - # Reverse value->name map for hashable values. - enum_class._value2member_map_ = {} - - # If a custom type is mixed into the Enum, and it does not know how - # to pickle itself, pickle.dumps will succeed but pickle.loads will - # fail. Rather than have the error show up later and possibly far - # from the source, sabotage the pickle protocol for this class so - # that pickle.dumps also fails. - # - # However, if the new class implements its own __reduce_ex__, do not - # sabotage -- it's on them to make sure it works correctly. We use - # __reduce_ex__ instead of any of the others as it is preferred by - # pickle over __reduce__, and it handles all pickle protocols. - if "__reduce_ex__" not in classdict: - if member_type is not object: - methods = ( - "__getnewargs_ex__", - "__getnewargs__", - "__reduce_ex__", - "__reduce__", - ) - if not any(m in member_type.__dict__ for m in methods): - _make_class_unpicklable(enum_class) - - # instantiate them, checking for duplicates as we go - # we instantiate first instead of checking for duplicates first in case - # a custom __new__ is doing something funky with the values -- such as - # auto-numbering ;) - for member_name in classdict._member_names: - value = enum_members[member_name] - if not isinstance(value, tuple): - args = (value,) - else: - args = value - if member_type is tuple: # special case for tuple enums - args = (args,) # wrap it one more time - if not use_args: - enum_member = __new__(enum_class) - if not hasattr(enum_member, "_value_"): - enum_member._value_ = value - else: - enum_member = __new__(enum_class, *args) - if not hasattr(enum_member, "_value_"): - if member_type is object: - enum_member._value_ = value - else: - enum_member._value_ = member_type(*args) - value = enum_member._value_ - enum_member._name_ = member_name - enum_member.__objclass__ = enum_class - enum_member.__init__(*args) - # If another member with the same value was already defined, the - # new member becomes an alias to the existing one. - for name, canonical_member in enum_class._member_map_.items(): - if canonical_member._value_ == enum_member._value_: - enum_member = canonical_member - break - else: - # Aliases don't appear in member names (only in __members__). - enum_class._member_names_.append(member_name) - # performance boost for any member that would not shadow - # a DynamicClassAttribute - if member_name not in base_attributes: - setattr(enum_class, member_name, enum_member) - # now add to _member_map_ - enum_class._member_map_[member_name] = enum_member - try: - # This may fail if value is not hashable. We can't add the value - # to the map, and by-value lookups for this value will be - # linear. 
- enum_class._value2member_map_[value] = enum_member - except TypeError: - pass - - # double check that repr and friends are not the mixin's or various - # things break (such as pickle) - for name in ("__repr__", "__str__", "__format__", "__reduce_ex__"): - class_method = getattr(enum_class, name) - obj_method = getattr(member_type, name, None) - enum_method = getattr(first_enum, name, None) - if obj_method is not None and obj_method is class_method: - setattr(enum_class, name, enum_method) - - # replace any other __new__ with our own (as long as Enum is not None, - # anyway) -- again, this is to support pickle - if Enum is not None: - # if the user defined their own __new__, save it before it gets - # clobbered in case they subclass later - if save_new: - enum_class.__new_member__ = __new__ - enum_class.__new__ = Enum.__new__ - - # py3 support for definition order (helps keep py2/py3 code in sync) - if _order_ is not None: - if isinstance(_order_, str): - _order_ = _order_.replace(",", " ").split() - if _order_ != enum_class._member_names_: - raise TypeError("member order does not match _order_") - - return enum_class - - def __bool__(self): - """ - classes/types should always be True. - """ - return True - - def __call__( - cls, value, names=None, *, module=None, qualname=None, type=None, start=1 - ): - """Either returns an existing member, or creates a new enum class. - - This method is used both when an enum class is given a value to match - to an enumeration member (i.e. Color(3)) and for the functional API - (i.e. Color = Enum('Color', names='RED GREEN BLUE')). - - When used for the functional API: - - `value` will be the name of the new class. - - `names` should be either a string of white-space/comma delimited names - (values will start at `start`), or an iterator/mapping of name, value pairs. - - `module` should be set to the module this class is being created in; - if it is not set, an attempt to find that module will be made, but if - it fails the class will not be picklable. - - `qualname` should be set to the actual location this class can be found - at in its module; by default it is set to the global scope. If this is - not correct, unpickling will fail in some circumstances. - - `type`, if set, will be mixed in as the first base class. - - """ - if names is None: # simple value lookup - return cls.__new__(cls, value) - # otherwise, functional API: we're creating a new Enum type - return cls._create_( - value, names, module=module, qualname=qualname, type=type, start=start - ) - - def __contains__(cls, member): - return isinstance(member, cls) and member._name_ in cls._member_map_ - - def __delattr__(cls, attr): - # nicer error message when someone tries to delete an attribute - # (see issue19025). - if attr in cls._member_map_: - raise AttributeError("%s: cannot delete Enum member." % cls.__name__) - super().__delattr__(attr) - - def __dir__(self): - return [ - "__class__", - "__doc__", - "__members__", - "__module__", - ] + self._member_names_ - - def __getattr__(cls, name): - """Return the enum member matching `name` - - We use __getattr__ instead of descriptors or inserting into the enum - class' __dict__ in order to support `name` and `value` being both - properties for enum members (which live in the class' __dict__) and - enum members themselves. 
- - """ - if _is_dunder(name): - raise AttributeError(name) - try: - return cls._member_map_[name] - except KeyError: - raise AttributeError(name) from None - - def __getitem__(cls, name): - return cls._member_map_[name] - - def __iter__(cls): - return (cls._member_map_[name] for name in cls._member_names_) - - def __len__(cls): - return len(cls._member_names_) - - @property - def __members__(cls): - """Returns a mapping of member name->value. - - This mapping lists all enum members, including aliases. Note that this - is a read-only view of the internal mapping. - - """ - return MappingProxyType(cls._member_map_) - - def __repr__(cls): - return "" % cls.__name__ - - def __reversed__(cls): - return (cls._member_map_[name] for name in reversed(cls._member_names_)) - - def __setattr__(cls, name, value): - """Block attempts to reassign Enum members. - - A simple assignment to the class namespace only changes one of the - several possible ways to get an Enum member from the Enum class, - resulting in an inconsistent Enumeration. - - """ - member_map = cls.__dict__.get("_member_map_", {}) - if name in member_map: - raise AttributeError("Cannot reassign members.") - super().__setattr__(name, value) - - def _create_( - cls, class_name, names=None, *, module=None, qualname=None, type=None, start=1 - ): - """Convenience method to create a new Enum class. - - `names` can be: - - * A string containing member names, separated either with spaces or - commas. Values are incremented by 1 from `start`. - * An iterable of member names. Values are incremented by 1 from `start`. - * An iterable of (member name, value) pairs. - * A mapping of member name -> value pairs. - - """ - metacls = cls.__class__ - bases = (cls,) if type is None else (type, cls) - _, first_enum = cls._get_mixins_(bases) - classdict = metacls.__prepare__(class_name, bases) - - # special processing needed for names? - if isinstance(names, str): - names = names.replace(",", " ").split() - if isinstance(names, (tuple, list)) and names and isinstance(names[0], str): - original_names, names = names, [] - last_values = [] - for count, name in enumerate(original_names): - value = first_enum._generate_next_value_( - name, start, count, last_values[:] - ) - last_values.append(value) - names.append((name, value)) - - # Here, names is either an iterable of (name, value) or a mapping. - for item in names: - if isinstance(item, str): - member_name, member_value = item, names[item] - else: - member_name, member_value = item - classdict[member_name] = member_value - enum_class = metacls.__new__(metacls, class_name, bases, classdict) - - # TODO: replace the frame hack if a blessed way to know the calling - # module is ever developed - if module is None: - try: - module = sys._getframe(2).f_globals["__name__"] - except (AttributeError, ValueError) as exc: - pass - if module is None: - _make_class_unpicklable(enum_class) - else: - enum_class.__module__ = module - if qualname is not None: - enum_class.__qualname__ = qualname - - return enum_class - - @staticmethod - def _get_mixins_(bases): - """Returns the type for creating enum members, and the first inherited - enum class. 
- - bases: the tuple of bases that was given to __new__ - - """ - if not bases: - return object, Enum - - # double check that we are not subclassing a class with existing - # enumeration members; while we're at it, see if any other data - # type has been mixed in so we can use the correct __new__ - member_type = first_enum = None - for base in bases: - if base is not Enum and issubclass(base, Enum) and base._member_names_: - raise TypeError("Cannot extend enumerations") - # base is now the last base in bases - if not issubclass(base, Enum): - raise TypeError( - "new enumerations must be created as " - "`ClassName([mixin_type,] enum_type)`" - ) - - # get correct mix-in type (either mix-in type of Enum subclass, or - # first base if last base is Enum) - if not issubclass(bases[0], Enum): - member_type = bases[0] # first data type - first_enum = bases[-1] # enum type - else: - for base in bases[0].__mro__: - # most common: (IntEnum, int, Enum, object) - # possible: (, , - # , , - # ) - if issubclass(base, Enum): - if first_enum is None: - first_enum = base - else: - if member_type is None: - member_type = base - - return member_type, first_enum - - @staticmethod - def _find_new_(classdict, member_type, first_enum): - """Returns the __new__ to be used for creating the enum members. - - classdict: the class dictionary given to __new__ - member_type: the data type whose __new__ will be used by default - first_enum: enumeration to check for an overriding __new__ - - """ - # now find the correct __new__, checking to see of one was defined - # by the user; also check earlier enum classes in case a __new__ was - # saved as __new_member__ - __new__ = classdict.get("__new__", None) - - # should __new__ be saved as __new_member__ later? - save_new = __new__ is not None - - if __new__ is None: - # check all possibles for __new_member__ before falling back to - # __new__ - for method in ("__new_member__", "__new__"): - for possible in (member_type, first_enum): - target = getattr(possible, method, None) - if target not in { - None, - None.__new__, - object.__new__, - Enum.__new__, - }: - __new__ = target - break - if __new__ is not None: - break - else: - __new__ = object.__new__ - - # if a non-object.__new__ is used then whatever value/tuple was - # assigned to the enum member name will be passed to __new__ and to the - # new enum member's __init__ - if __new__ is object.__new__: - use_args = False - else: - use_args = True - - return __new__, save_new, use_args - - -class Enum(metaclass=EnumMeta): - """Generic enumeration. - - Derive from this class to define new enumerations. - - """ - - def __new__(cls, value): - # all enum instances are actually created during class construction - # without calling this method; this method is called by the metaclass' - # __call__ (i.e. 
Color(3) ), and by pickle - if type(value) is cls: - # For lookups like Color(Color.RED) - return value - # by-value search for a matching enum member - # see if it's in the reverse mapping (for hashable values) - try: - if value in cls._value2member_map_: - return cls._value2member_map_[value] - except TypeError: - # not there, now do long search -- O(n) behavior - for member in cls._member_map_.values(): - if member._value_ == value: - return member - # still not found -- try _missing_ hook - return cls._missing_(value) - - def _generate_next_value_(name, start, count, last_values): - for last_value in reversed(last_values): - try: - return last_value + 1 - except TypeError: - pass - else: - return start - - @classmethod - def _missing_(cls, value): - raise ValueError("%r is not a valid %s" % (value, cls.__name__)) - - def __repr__(self): - return "<%s.%s: %r>" % (self.__class__.__name__, self._name_, self._value_) - - def __str__(self): - return "%s.%s" % (self.__class__.__name__, self._name_) - - def __dir__(self): - added_behavior = [ - m - for cls in self.__class__.mro() - for m in cls.__dict__ - if m[0] != "_" and m not in self._member_map_ - ] - return ["__class__", "__doc__", "__module__"] + added_behavior - - def __format__(self, format_spec): - # mixed-in Enums should use the mixed-in type's __format__, otherwise - # we can get strange results with the Enum name showing up instead of - # the value - - # pure Enum branch - if self._member_type_ is object: - cls = str - val = str(self) - # mix-in branch - else: - cls = self._member_type_ - val = self._value_ - return cls.__format__(val, format_spec) - - def __hash__(self): - return hash(self._name_) - - def __reduce_ex__(self, proto): - return self.__class__, (self._value_,) - - # DynamicClassAttribute is used to provide access to the `name` and - # `value` properties of enum members while keeping some measure of - # protection from modification, while still allowing for an enumeration - # to have members named `name` and `value`. This works because enumeration - # members are not set directly on the enum class -- __getattr__ is - # used to look them up. - - @DynamicClassAttribute - def name(self): - """The name of the Enum member.""" - return self._name_ - - @DynamicClassAttribute - def value(self): - """The value of the Enum member.""" - return self._value_ - - @classmethod - def _convert(cls, name, module, filter, source=None): - """ - Create a new Enum subclass that replaces a collection of global constants - """ - # convert all constants from source (or module) that pass filter() to - # a new Enum called name, and export the enum and its members back to - # module; - # also, replace the __reduce_ex__ method so unpickling works in - # previous Python versions - module_globals = vars(sys.modules[module]) - if source: - source = vars(source) - else: - source = module_globals - # We use an OrderedDict of sorted source keys so that the - # _value2member_map is populated in the same order every time - # for a consistent reverse mapping of number to name when there - # are multiple names for the same number rather than varying - # between runs due to hash randomization of the module dictionary. 
- members = [(name, source[name]) for name in source.keys() if filter(name)] - try: - # sort by value - members.sort(key=lambda t: (t[1], t[0])) - except TypeError: - # unless some values aren't comparable, in which case sort by name - members.sort(key=lambda t: t[0]) - cls = cls(name, members, module=module) - cls.__reduce_ex__ = _reduce_ex_by_name - module_globals.update(cls.__members__) - module_globals[name] = cls - return cls - - -class IntEnum(int, Enum): - """Enum where members are also (and must be) ints""" - - -def _reduce_ex_by_name(self, proto): - return self.name - - -class Flag(Enum): - """Support for flags""" - - def _generate_next_value_(name, start, count, last_values): - """ - Generate the next value when not given. - - name: the name of the member - start: the initital start value or None - count: the number of existing members - last_value: the last value assigned or None - """ - if not count: - return start if start is not None else 1 - for last_value in reversed(last_values): - try: - high_bit = _high_bit(last_value) - break - except Exception: - raise TypeError("Invalid Flag value: %r" % last_value) from None - return 2 ** (high_bit + 1) - - @classmethod - def _missing_(cls, value): - original_value = value - if value < 0: - value = ~value - possible_member = cls._create_pseudo_member_(value) - if original_value < 0: - possible_member = ~possible_member - return possible_member - - @classmethod - def _create_pseudo_member_(cls, value): - """ - Create a composite member iff value contains only members. - """ - pseudo_member = cls._value2member_map_.get(value, None) - if pseudo_member is None: - # verify all bits are accounted for - _, extra_flags = _decompose(cls, value) - if extra_flags: - raise ValueError("%r is not a valid %s" % (value, cls.__name__)) - # construct a singleton enum pseudo-member - pseudo_member = object.__new__(cls) - pseudo_member._name_ = None - pseudo_member._value_ = value - # use setdefault in case another thread already created a composite - # with this value - pseudo_member = cls._value2member_map_.setdefault(value, pseudo_member) - return pseudo_member - - def __contains__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return other._value_ & self._value_ == other._value_ - - def __repr__(self): - cls = self.__class__ - if self._name_ is not None: - return "<%s.%s: %r>" % (cls.__name__, self._name_, self._value_) - members, uncovered = _decompose(cls, self._value_) - return "<%s.%s: %r>" % ( - cls.__name__, - "|".join([str(m._name_ or m._value_) for m in members]), - self._value_, - ) - - def __str__(self): - cls = self.__class__ - if self._name_ is not None: - return "%s.%s" % (cls.__name__, self._name_) - members, uncovered = _decompose(cls, self._value_) - if len(members) == 1 and members[0]._name_ is None: - return "%s.%r" % (cls.__name__, members[0]._value_) - else: - return "%s.%s" % ( - cls.__name__, - "|".join([str(m._name_ or m._value_) for m in members]), - ) - - def __bool__(self): - return bool(self._value_) - - def __or__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return self.__class__(self._value_ | other._value_) - - def __and__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return self.__class__(self._value_ & other._value_) - - def __xor__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return self.__class__(self._value_ ^ other._value_) - - def __invert__(self): - members, 
uncovered = _decompose(self.__class__, self._value_) - inverted_members = [ - m - for m in self.__class__ - if m not in members and not m._value_ & self._value_ - ] - inverted = reduce(_or_, inverted_members, self.__class__(0)) - return self.__class__(inverted) - - -class IntFlag(int, Flag): - """Support for integer-based Flags""" - - @classmethod - def _missing_(cls, value): - if not isinstance(value, int): - raise ValueError("%r is not a valid %s" % (value, cls.__name__)) - new_member = cls._create_pseudo_member_(value) - return new_member - - @classmethod - def _create_pseudo_member_(cls, value): - pseudo_member = cls._value2member_map_.get(value, None) - if pseudo_member is None: - need_to_create = [value] - # get unaccounted for bits - _, extra_flags = _decompose(cls, value) - # timer = 10 - while extra_flags: - # timer -= 1 - bit = _high_bit(extra_flags) - flag_value = 2 ** bit - if ( - flag_value not in cls._value2member_map_ - and flag_value not in need_to_create - ): - need_to_create.append(flag_value) - if extra_flags == -flag_value: - extra_flags = 0 - else: - extra_flags ^= flag_value - for value in reversed(need_to_create): - # construct singleton pseudo-members - pseudo_member = int.__new__(cls, value) - pseudo_member._name_ = None - pseudo_member._value_ = value - # use setdefault in case another thread already created a composite - # with this value - pseudo_member = cls._value2member_map_.setdefault(value, pseudo_member) - return pseudo_member - - def __or__(self, other): - if not isinstance(other, (self.__class__, int)): - return NotImplemented - result = self.__class__(self._value_ | self.__class__(other)._value_) - return result - - def __and__(self, other): - if not isinstance(other, (self.__class__, int)): - return NotImplemented - return self.__class__(self._value_ & self.__class__(other)._value_) - - def __xor__(self, other): - if not isinstance(other, (self.__class__, int)): - return NotImplemented - return self.__class__(self._value_ ^ self.__class__(other)._value_) - - __ror__ = __or__ - __rand__ = __and__ - __rxor__ = __xor__ - - def __invert__(self): - result = self.__class__(~self._value_) - return result - - -def _high_bit(value): - """returns index of highest bit, or -1 if value is zero or negative""" - return value.bit_length() - 1 - - -def unique(enumeration): - """Class decorator for enumerations ensuring unique member values.""" - duplicates = [] - for name, member in enumeration.__members__.items(): - if name != member.name: - duplicates.append((name, member.name)) - if duplicates: - alias_details = ", ".join( - ["%s -> %s" % (alias, name) for (alias, name) in duplicates] - ) - raise ValueError( - "duplicate values found in %r: %s" % (enumeration, alias_details) - ) - return enumeration - - -def _decompose(flag, value): - """Extract all members from the value.""" - # _decompose is only called if the value is not named - not_covered = value - negative = value < 0 - # issue29167: wrap accesses to _value2member_map_ in a list to avoid race - # conditions between iterating over it and having more psuedo- - # members added to it - if negative: - # only check for named flags - flags_to_check = [ - (m, v) - for v, m in list(flag._value2member_map_.items()) - if m.name is not None - ] - else: - # check for named flags and powers-of-two flags - flags_to_check = [ - (m, v) - for v, m in list(flag._value2member_map_.items()) - if m.name is not None or _power_of_two(v) - ] - members = [] - for member, member_value in flags_to_check: - if member_value and 
member_value & value == member_value: - members.append(member) - not_covered &= ~member_value - if not members and value in flag._value2member_map_: - members.append(flag._value2member_map_[value]) - members.sort(key=lambda m: m._value_, reverse=True) - if len(members) > 1 and members[0].value == value: - # we have the breakdown, don't need the value member itself - members.pop(0) - return members, not_covered - - -def _power_of_two(value): - if value < 1: - return False - return value == 2 ** _high_bit(value) diff --git a/python_module/megengine/_internal/exc.py b/python_module/megengine/_internal/exc.py deleted file mode 100644 index 954756b1..00000000 --- a/python_module/megengine/_internal/exc.py +++ /dev/null @@ -1,58 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -"""exception handling""" - -from . import mgb as _mgb - - -class MegBrainError(Exception): - """exception class used by megbrain library""" - - tracker = None - """the tracker setup by :func:`.set_exc_opr_tracker` when the related - operator is created""" - - tracker_grad_orig = None - """if this operator is created by taking gradient, this var would be the - tracker of the operator that causes the grad.""" - - def __init__(self, msg, tracker, tracker_grad_orig): - assert isinstance(msg, str) - super().__init__(msg, tracker, tracker_grad_orig) - self.tracker = tracker - self.tracker_grad_orig = tracker_grad_orig - - @classmethod - def _format_tracker(cls, tracker): - return ("| " + i for i in str(tracker).split("\n")) - - def __str__(self): - lines = [] - lines.extend(self.args[0].split("\n")) - if self.tracker is not None: - lines.append("Exception tracker:") - lines.extend(self._format_tracker(self.tracker)) - if self.tracker_grad_orig is not None: - lines.append( - "Exception caused by taking grad of another operator with tracker:" - ) - lines.extend(self._format_tracker(self.tracker_grad_orig)) - while not lines[-1].strip(): - lines.pop() - for idx, ct in enumerate(lines): - if ct.startswith("bt:"): - lines[idx] = "+ " + lines[idx] - for t in range(idx + 1, len(lines)): - lines[t] = "| " + lines[t] - break - return "\n".join(lines) - - -_mgb._reg_exception_class(MegBrainError) diff --git a/python_module/megengine/_internal/global_init.py b/python_module/megengine/_internal/global_init.py deleted file mode 100644 index 1b4fff87..00000000 --- a/python_module/megengine/_internal/global_init.py +++ /dev/null @@ -1,41 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -"""global initialization work; classes/functions defined in this module should -not be used by user code""" - -import atexit -import os -import sys -import traceback - -from . 
import mgb -from .logconf import get_logger -from .persistent_cache import PersistentCacheOnServer - - -class PyStackExtracterImpl(mgb._PyStackExtracter): - def extract(self): - return "".join(traceback.format_stack()[:-1]) - - -mgb._register_logger(get_logger()) -assert sys.executable -mgb._timed_func_set_fork_exec_path( - sys.executable, - os.path.join(os.path.dirname(__file__), "_timed_func_fork_exec_entry.py"), -) - -persistent_cache_impl_ins = PersistentCacheOnServer() -mgb._PersistentCache.reg(persistent_cache_impl_ins) - -PyStackExtracterImplIns = PyStackExtracterImpl() -PyStackExtracterImpl.reg(PyStackExtracterImplIns) - -atexit.register(mgb._mgb_global_finalize) diff --git a/python_module/megengine/_internal/helper.py b/python_module/megengine/_internal/helper.py deleted file mode 100644 index 8fbb974f..00000000 --- a/python_module/megengine/_internal/helper.py +++ /dev/null @@ -1,316 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -import collections - -import numpy as np - -from . import mgb -from .exc import MegBrainError -from .mgb import SharedND, SymbolVar -from .opr_param_defs import OptionalAxisV1 - - -def canonize_reshape(inputs, *, comp_graph, config): - src, tshape = inputs - tshape = cvt_to_shape_desc(tshape, src, comp_graph, config) - return src, tshape - - -def canonize_shape_input(inputs, *, comp_graph, config): - assert isinstance(inputs, (list, tuple)) and len(inputs) == 1 - return [cvt_to_shape_desc(inputs[0], None, comp_graph, config)] - - -def cvt_to_shape_desc(val, inpvar, graph, config): - """convert some python object to a :class:`SymbolVar` that describes tensor - shape - - :param val: the python object to be converted from - :param inpvar, graph, config: provide graph and comp node information; can - be None if not known. Either input or (graph, config) must be provided. 
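
A sketch of the conversions this helper performs (illustrative; assumes ``x``
and ``n`` are SymbolVars on the same graph):

    cvt_to_shape_desc((2, 3, 4), x, None, None)  # all ints: one immutable int32 tensor
    cvt_to_shape_desc((2, n), x, None, None)     # mixed: per-element vars concat'ed on axis 0
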
- :return: a new var corresponding to *val* - :rtype: :class:`.SymbolVar` - """ - if hasattr(val, "__mgb_symvar__"): - val = val.__mgb_symvar__() - elif hasattr(val, "symvar"): - val = val.symvar - if isinstance(val, SymbolVar): - return val - if not isinstance(val, collections.Iterable): - val = [val] - components = [] - has_sym = False - for i in val: - if hasattr(i, "__mgb_symvar__"): - i = i.__mgb_symvar__() - elif hasattr(i, "symvar"): - i = i.symvar - if isinstance(i, SymbolVar): - has_sym = True - components.append(i) - else: - assert isinstance(i, int), ( - "shape desc could contain either int or SymbolVar, got {}" - " actually".format(repr(i)) - ) - components.append(i) - assert components, "shape desc could not be empty" - - if inpvar is not None: - assert isinstance(inpvar, SymbolVar) - if graph is None: - graph = inpvar.owner_graph - else: - assert graph == inpvar.owner_graph - config = mgb.make_opr_config(comp_node=inpvar.comp_node) - else: - assert isinstance(graph, mgb.CompGraph), "graph must be provided" - assert isinstance(config, mgb.OperatorNodeConfig) - - if not has_sym: - shape = np.ascontiguousarray(components, dtype=np.int32) - assert np.all(shape == components), "failed to convert to shape: {}".format( - components - ) - return mgb._make_immutable(graph, shape, None, config) - - for idx, v in enumerate(components): - if not isinstance(v, SymbolVar): - vi = int(v) - assert vi == v, "could not convert {} to int".format(v) - components[idx] = mgb._make_immutable(graph, vi, None, config) - from . import opr as O - - return O.concat(components, axis=0, config=config) - - -def canonize_input_vars(inputs, *, comp_graph, config): - """convert immediate numbers and SharedND to SymbolVar in inputs; at least - one of the inputs must be SymbolVar, so comp node and comp graph can - beinferred - - :return: list of converted vars - """ - from . import make_immutable - - if ( - isinstance(inputs, (list, tuple)) - and len(inputs) == 1 - and isinstance(inputs[0], (list, tuple)) - ): - # handle the case when a list is passed to a function with - # variable-length argument (e.g. 
concat has signature concat(*inputs) - # and is called with concat([a, b])) - inputs = inputs[0] - - if isinstance(inputs, SymbolVar): - return [inputs] - - old_inputs = inputs - inputs = [] - get_comp_node = None - need_cvt = False - for i in old_inputs: - if isinstance(i, SymbolVar): - get_comp_node = lambda cn=i.comp_node: cn - if comp_graph is not None: - assert comp_graph == i.owner_graph - else: - comp_graph = i.owner_graph - else: - need_cvt = True - inputs.append(i) - if not need_cvt: - return inputs - - if get_comp_node is None: - - def get_comp_node(): - nonlocal get_comp_node - cn = config.require_comp_node() - get_comp_node = lambda: cn - return cn - - for idx, var in enumerate(inputs): - if not isinstance(var, SymbolVar): - if isinstance(var, SharedND): - var = var.symvar(comp_graph) - elif isinstance(var, mgb.SharedScalar): - var = var._as_sym_var(comp_graph, get_comp_node()) - elif hasattr(var, "__mgb_symvar__"): - try: - cn = get_comp_node() - except MegBrainError: - cn = None - var = var.__mgb_symvar__(comp_graph=comp_graph, comp_node=cn) - elif hasattr(var, "symvar"): - var = var.symvar - else: - var = make_immutable(get_comp_node(), comp_graph, var) - inputs[idx] = var - return inputs - - -def cvt_to_vector_of_shape(shapes): - """convert ``[[int]]`` to nested ``std::vector`` of ``size_t``""" - ret = mgb._VectorTensorShape() - for i in shapes: - val = tuple(i) - assert val and all( - j > 0 and isinstance(j, int) for j in val - ), "something returns bad shape in infer_shape(): {}".format(val) - ret.push_back(val) - return ret - - -def cvt_to_opr_param_def(param, ptype, kwargs): - if param is not None: - if isinstance(param, ptype): - return param - - param = [param] - assert len(param) == len( - ptype.__slots__ - ), "{} needs {} params, but {} are provided".format( - ptype, len(ptype.__slots__), len(param) - ) - return ptype(*param) - - ckw = {} - for i in ptype.__slots__: - val = kwargs.pop(i, ckw) - if val is not ckw: - ckw[i] = val - return ptype(**ckw) - - -def cvt_getitem_to_idx_desc(inpvar, tuple_val, *, allow_newaxis=True): - """convert ``__getitem__`` args to index desc - - :return: ``(new_var, index_desc)`` where new_var is inpvar with - ``np.newaxis`` applied; note that ``index_desc`` can be ``None``. 
- """ - assert isinstance(inpvar, SymbolVar), "bad input: {!r}".format(inpvar) - if not isinstance(tuple_val, tuple): - tuple_val = (tuple_val,) - - axis_indexer = mgb._VectorAxisIndexer() - - config = mgb.make_opr_config(comp_node=inpvar.comp_node) - graph = inpvar.owner_graph - - def as_symvar(v, *, allow_list=True): - if isinstance(v, SymbolVar): - return v - vi = np.ascontiguousarray(v, dtype=np.int32) - assert np.abs(vi - v).max() == 0, "bad index: {!r}".format(v) - return mgb._make_immutable(graph, vi, None, config) - - def _s(v): # convert slice item - if v is None: - return SymbolVar() - return as_symvar(v, allow_list=False) - - new_axes = [] - cur_axis = -1 - for i_idx, i in enumerate(tuple_val): - cur_axis += 1 - if i is np.newaxis: - if cur_axis >= 0: - new_axes.append(cur_axis) - continue - - if i is Ellipsis: - cur_axis = -1 - for j in tuple_val[:i_idx:-1]: - if j is Ellipsis: - raise IndexError("only one ellipsis is allowed") - if j is np.newaxis: - new_axes.append(cur_axis) - cur_axis -= 1 - continue - - if isinstance(i, slice): - if i.start is None and i.stop is None and i.step is None: - continue - cur = mgb._AxisIndexer.make_interval( - cur_axis, _s(i.start), _s(i.stop), _s(i.step) - ) - else: - cur = mgb._AxisIndexer.make_index(cur_axis, as_symvar(i)) - axis_indexer.push_back(cur) - if new_axes: - if not allow_newaxis: - raise IndexError("newaxis is not allowed here") - inpvar = mgb._Opr.add_axis(inpvar, new_axes, mgb.make_opr_config()) - if axis_indexer.empty(): - axis_indexer = None - return inpvar, axis_indexer - - -def cvt_to_reshape_unspec_axis(unspec_axis, tshape): - assert isinstance(unspec_axis, OptionalAxisV1), repr(unspec_axis) - unspec_axis = unspec_axis.axis - assert abs(unspec_axis) <= OptionalAxisV1.MAX_NDIM - if not isinstance(tshape, SymbolVar): - for idx, val in enumerate(tshape): - if val == -1: - assert ( - unspec_axis == OptionalAxisV1.INVALID_AXIS - ), "multiple unknown dimensions for reshape" - unspec_axis = idx - return OptionalAxisV1(unspec_axis) - - -def gen_config(name, comp_node, config, output_dtype=None): - if config is None: - config = mgb.make_opr_config(name, comp_node, output_dtype) - else: - assert isinstance(config, mgb.OperatorNodeConfig) - assert name is None and comp_node is None - return config - - -def cvt_opr_result(rst, *, explode_single=True): - """:param explode_single: whether to return the content of a single-item - list rather thatn the list itself""" - if not isinstance(rst, mgb.SymbolVar): - assert isinstance(rst, (list, tuple)) - if len(rst) == 1 and explode_single: - return cvt_opr_result(rst[0]) - return tuple(map(cvt_opr_result, rst)) - if not rst.valid: - return None - # TODO Because the __init__ of SwigObject can not be modified to keep the - # reference of graph, we get owner graph explicitly here. The correct - # handling is moving the reference to SwigWrapper, but it is unsupported to - # add a member variable to SwigWrapper, so we should wrap the SymbolVar - # manually in megbrain_wrap.h - rst.owner_graph - - f32 = np.float32 - if not hasattr(cvt_opr_result, "_cvt_to_float32"): - import os - from .logconf import get_logger - - cvt_opr_result._cvt_to_float32 = os.getenv("MGB_ALL_FLOAT32") - if cvt_opr_result._cvt_to_float32: - get_logger().warn( - "\n" - "+=====================================================+\n" - "| MGB_ALL_FLOAT32 is set, so all megbrain opr result |\n" - "| would to converted to float32; this should only be |\n" - "| used for loading old models. 
|\n" - "+=====================================================+" - ) - if cvt_opr_result._cvt_to_float32 and rst.dtype != f32: - rst = rst.astype(f32) - return rst diff --git a/python_module/megengine/_internal/logconf.py b/python_module/megengine/_internal/logconf.py deleted file mode 100644 index f88c8c08..00000000 --- a/python_module/megengine/_internal/logconf.py +++ /dev/null @@ -1,54 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -import logging -import os - -_replaced_logger = None - - -def get_logger(): - global _replaced_logger - if _replaced_logger is not None: - return _replaced_logger - logger = logging.getLogger("megbrain") - logger.propagate = False - logger.setLevel(logging.INFO) - handler = logging.StreamHandler() - handler.setFormatter(MgbLogFormatter(datefmt="%d %H:%M:%S")) - handler.setLevel(0) - del logger.handlers[:] - logger.addHandler(handler) - _replaced_logger = logger - return logger - - -class MgbLogFormatter(logging.Formatter): - def format(self, record): - date = "\x1b[32m[%(asctime)s %(lineno)d@%(filename)s:%(name)s]\x1b[0m" - msg = "%(message)s" - if record.levelno == logging.DEBUG: - fmt = "{} \x1b[32mDBG\x1b[0m {}".format(date, msg) - elif record.levelno == logging.WARNING: - fmt = "{} \x1b[1;31mWRN\x1b[0m {}".format(date, msg) - elif record.levelno == logging.ERROR: - fmt = "{} \x1b[1;4;31mERR\x1b[0m {}".format(date, msg) - else: - fmt = date + " " + msg - self._style._fmt = fmt - return super().format(record) - - -def set_logger(logger): - """replace the logger""" - global _replaced_logger - _replaced_logger = logger - from .mgb import _register_logger - - _register_logger(logger) diff --git a/python_module/megengine/_internal/mgb_helper.py b/python_module/megengine/_internal/mgb_helper.py deleted file mode 100644 index 955d5c88..00000000 --- a/python_module/megengine/_internal/mgb_helper.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -"""helper utils for the core mgb module""" - -import collections -import inspect -import json -import threading -from abc import ABCMeta, abstractmethod - - -class callback_lazycopy: - """wraps around a callable to be passed to :meth:`.CompGraph.compile`. - - This is used to disable eager copy, so we could get rid of an h2d copy and - a d2h if values are to be passed from one callback to another - :class:`.SharedND`. 
- """ - - def __init__(self, func): - assert isinstance(func, collections.Callable) - self.__func = func - - @property - def func(self): - return self.__func - - -class SharedNDLazyInitializer(metaclass=ABCMeta): - """lazy initialization policy for :class:`.SharedND`""" - - @abstractmethod - def get_shape(self): - """get shape, without loading value""" - - @abstractmethod - def get_value(self): - """get value as numpy ndarray""" - - -class copy_output: - """wraps a :class:`.SymbolVar` in outspec for :meth:`.CompGraph.compile`, - to copy the output to function return value""" - - symvar = None - borrow_mem = None - - def __init__(self, symvar, *, borrow_mem=False): - """ - - :param borrow_mem: see :meth:`.CompGraphCallbackValueProxy.get_value` - """ - from .mgb import SymbolVar - - assert isinstance( - symvar, SymbolVar - ), "copy_output expects an SymbolVar, got {} instead".format(symvar) - self.symvar = symvar - self.borrow_mem = borrow_mem - - -class FuncOutputSaver: - """instance could be used as callbacks for :meth:`.CompGraph.compile` to - copy output to host buffer - """ - - _value = None - _borrow_mem = None - - def __init__(self, borrow_mem=False): - self._borrow_mem = borrow_mem - - def __call__(self, v): - self._value = v.get_value(borrow_mem=self._borrow_mem) - - def get(self): - assert ( - self._value is not None - ), "{} not called; maybe due to unwaited async func".format(self) - return self._value diff --git a/python_module/megengine/_internal/opr_extra.py b/python_module/megengine/_internal/opr_extra.py deleted file mode 100644 index 7a59a2cf..00000000 --- a/python_module/megengine/_internal/opr_extra.py +++ /dev/null @@ -1,3 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2015-2019 Megvii Inc. All rights reserved. - diff --git a/python_module/megengine/_internal/persistent_cache.py b/python_module/megengine/_internal/persistent_cache.py deleted file mode 100644 index 47da6637..00000000 --- a/python_module/megengine/_internal/persistent_cache.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- -import argparse -import getpass -import json -import os -import shelve - -from .logconf import get_logger -from .mgb import _PersistentCache -from .version import __version__ - - -class _FakeRedisConn: - def __init__(self): - try: - from ..hub.hub import _get_megengine_home - - cache_dir = os.path.expanduser( - os.path.join(_get_megengine_home(), "persistent_cache") - ) - os.makedirs(cache_dir, exist_ok=True) - cache_file = os.path.join(cache_dir, "cache") - self._dict = shelve.open(cache_file) - self._is_shelve = True - except: - self._dict = {} - self._is_shelve = False - - def get(self, key): - if self._is_shelve and isinstance(key, bytes): - key = key.decode("utf-8") - - return self._dict.get(key) - - def set(self, key, val): - if self._is_shelve and isinstance(key, bytes): - key = key.decode("utf-8") - - self._dict[key] = val - - def __del__(self): - if self._is_shelve: - self._dict.close() - - -class PersistentCacheOnServer(_PersistentCache): - _cached_conn = None - _prefix = None - _prev_get_refkeep = None - - @property - def _conn(self): - """get redis connection""" - if self._cached_conn is None: - self._cached_conn = _FakeRedisConn() - self._prefix = self.make_user_prefix() - - return self._cached_conn - - @classmethod - def make_user_prefix(cls): - return "mgbcache:{}".format(getpass.getuser()) - - - def _make_key(self, category, key): - prefix_with_version = "{}:MGB{}".format(self._prefix, __version__) - return b"@".join( - (prefix_with_version.encode("ascii"), category.encode("ascii"), key) - ) - - def put(self, category, key, value): - conn = self._conn - key = self._make_key(category, key) - conn.set(key, value) - - def get(self, category, key): - conn = self._conn - key = self._make_key(category, key) - self._prev_get_refkeep = conn.get(key) - return self._prev_get_refkeep - - diff --git a/python_module/megengine/_internal/plugin.py b/python_module/megengine/_internal/plugin.py deleted file mode 100644 index 4290bc1b..00000000 --- a/python_module/megengine/_internal/plugin.py +++ /dev/null @@ -1,261 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -"""plugins associated with computing graph""" - -import atexit -import collections -import json -import os -import platform -import signal -import struct - -import numpy as np - -from . 
import mgb as _mgb -from .logconf import get_logger - -InfkernFinderInputValueRec = collections.namedtuple( - "InfkernFinderInputValueRec", ["var_name", "var_id", "run_id", "value"] -) - - -class CompGraphProfiler(_mgb._CompGraphProfilerImpl): - """a plugin to profile computing graphs""" - - def __init__(self, comp_graph): - super().__init__(comp_graph) - - def get(self): - """get visualizable profiling result on a function""" - return json.loads(self._get_result()) - - def write_json(self, fobj): - """write the result to a json file - - :param fobj: a file-like object, or a string - """ - if isinstance(fobj, str): - with open(fobj, "w") as fout: - return self.write_json(fout) - fobj.write(self._get_result()) - - -class NumRangeChecker(_mgb._NumRangeCheckerImpl): - """check that all numberical float values of variables in a computing graph - are within given range""" - - def __init__(self, comp_graph, max_abs_val): - """:param max_abs_val: max absolute value""" - super().__init__(comp_graph, float(max_abs_val)) - - -class TextOprIODump(_mgb._TextOprIODumpImpl): - """dump all internal results as text to a file""" - - def __init__(self, comp_graph, fpath, *, print_addr=None, max_size=None): - super().__init__(comp_graph, fpath) - if print_addr is not None: - self.print_addr(print_addr) - if max_size is not None: - self.max_size(max_size) - - def print_addr(self, flag): - """set whether to print var address - - :return: self - """ - self._print_addr(flag) - return self - - def max_size(self, size): - """set the number of elements to be printed for each var - - :return: self - """ - self._max_size(size) - return self - - -class BinaryOprIODump(_mgb._BinaryOprIODumpImpl): - """dump all internal results binary files to a directory; the values can be - loaded by :func:`load_tensor_binary` - """ - - def __init__(self, comp_graph, dir_path): - super().__init__(comp_graph, dir_path) - - -class InfkernFinder(_mgb._InfkernFinderImpl): - """a plugin to find kernels that cause infinite loops""" - - def __init__(self, comp_graph, record_input_value): - """ - :param record_input_value: whether need to record input var values of - all operators - :type record_input_value: bool - """ - super().__init__(comp_graph, record_input_value) - - def write_to_file(self, fpath): - """write current execution status to a text file - - :return: ID of the first operator that is still not finished, - or None if all oprs are finished - :rtype: int or None - """ - v = self._write_to_file(fpath) - if v == 0: - return - return v - 1 - - def get_input_values(self, opr_id): - """get recorded input values of a given operator. Return a list - of :class:`InfkernFinderInputValueRec`. 
Note that the value in
-        each item is either None (if it is not recorded) or a numpy
-        array
-        """
-        ret = []
-        for idx in range(self._get_input_values_prepare(opr_id)):
-            vn = self._get_input_values_var_name(idx)
-            vi = self._get_input_values_var_idx(idx)
-            ri = self._get_input_values_run_id(idx)
-            val = self._get_input_values_val(idx)
-            if not val.shape:
-                val = None
-            else:
-                val = val.get_value()
-            ret.append(InfkernFinderInputValueRec(vn, vi, ri, val))
-        return ret
-
-
-def fast_signal_hander(signum, callback):
-    """bypass Python's signal handling system and register a handler that is
-    called as soon as possible in a dedicated thread (by contrast, Python
-    calls handlers in the main thread)
-
-    :param callback: signal callback, taking the signal number as its sole
-        argument
-    """
-
-    def cb_wrapped():
-        try:
-            callback(signum)
-        except:
-            get_logger().exception("error calling signal handler for {}".format(signum))
-
-    _mgb._FastSignal.register_handler(signum, cb_wrapped)
-
-
-atexit.register(_mgb._FastSignal.shutdown)
-
-
-class GlobalInfkernFinder:
-    """
-    manage a list of :class:`InfkernFinder` objects; when this process is
-    signaled with SIGUSR1, an interactive IPython shell is presented for
-    further investigation
-    """
-
-    _signal = None
-    if platform.system() != "Windows":
-        _signal = signal.SIGUSR1
-    else:
-        _signal = signal.CTRL_C_EVENT
-    _registry = []
-    _shell_maker = None
-
-    @classmethod
-    def add_graph(cls, comp_graph):
-        """register a graph so it can be tracked by :class:`InfkernFinder`"""
-        enabled = os.getenv("MGB_DBG_INFKERN_FINDER")
-        if not enabled:
-            return
-
-        if enabled == "1":
-            record_input_value = False
-        else:
-            assert enabled == "2", (
-                "MGB_DBG_INFKERN_FINDER must be either 1 or 2, indicating "
-                "whether to record input values"
-            )
-            record_input_value = True
-
-        finder = InfkernFinder(comp_graph, record_input_value)
-        get_logger().warning(
-            "interactive InfkernFinder {} registered to graph {}; all input "
-            "var values would be recorded and the graph would never be "
-            "reclaimed. You can enter the interactive debug session by "
-            'executing "kill -{} {}". record_input_value={}'.format(
-                finder, comp_graph, cls._signal, os.getpid(), record_input_value
-            )
-        )
-
-        if not cls._registry:
-            from IPython.terminal.embed import InteractiveShellEmbed
-
-            cls._shell_maker = InteractiveShellEmbed
-            fast_signal_hander(cls._signal, cls._on_signal)
-
-        cls._registry.append(finder)
-
-    @classmethod
-    def _on_signal(cls, signum):
-        shell = cls._shell_maker()
-        shell(
-            header="Enter interactive InfkernFinder session; the registered "
-            "finder objects can be found in variable f",
-            local_ns={"f": cls._registry},
-        )
-
-
-def load_tensor_binary(fobj):
-    """load a tensor dumped by the :class:`BinaryOprIODump` plugin; the actual
-    tensor value dump is implemented by ``mgb::debug::dump_tensor``.
-
-    Multiple values can be compared by ``tools/compare_binary_iodump.py``.
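The binary layout this loader parses is simple enough to produce by hand. A hedged sketch of the inverse writer (float32 only, matching dtype enum 0 in the loader's DTYPE_LIST below; the function name is illustrative, and the canonical dump lives in ``mgb::debug::dump_tensor``):

    import struct

    import numpy as np

    def dump_tensor_binary(fobj, value, name, max_ndim=4):
        # header: (name_len, dtype_enum, max_ndim), then a zero-padded shape,
        # then the ascii name, then the raw values
        value = np.ascontiguousarray(value, dtype=np.float32)
        shape = list(value.shape) + [0] * (max_ndim - value.ndim)
        fobj.write(struct.pack("III", len(name), 0, max_ndim))
        fobj.write(struct.pack("I" * max_ndim, *shape))
        fobj.write(name.encode("ascii"))
        value.tofile(fobj)

    with open("t.bin", "wb") as f:
        dump_tensor_binary(f, np.arange(6).reshape(2, 3), "x")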
- - :param fobj: file object, or a string that contains the file name - :return: tuple ``(tensor_value, tensor_name)`` - """ - if isinstance(fobj, str): - with open(fobj, "rb") as fin: - return load_tensor_binary(fin) - - DTYPE_LIST = { - 0: np.float32, - 1: np.uint8, - 2: np.int8, - 3: np.int16, - 4: np.int32, - 5: _mgb.intb1, - 6: _mgb.intb2, - 7: _mgb.intb4, - 8: None, - 9: np.float16, - # quantized dtype start from 100000 - # see MEGDNN_PARAMETERIZED_DTYPE_ENUM_BASE in - # dnn/include/megdnn/dtype.h - 100000: np.uint8, - 100001: np.int32, - 100002: np.int8, - } - - header_fmt = struct.Struct("III") - name_len, dtype, max_ndim = header_fmt.unpack(fobj.read(header_fmt.size)) - assert ( - DTYPE_LIST[dtype] is not None - ), "Cannot load this tensor: dtype Byte is unsupported." - - shape = list(struct.unpack("I" * max_ndim, fobj.read(max_ndim * 4))) - while shape[-1] == 0: - shape.pop(-1) - name = fobj.read(name_len).decode("ascii") - return np.fromfile(fobj, dtype=DTYPE_LIST[dtype]).reshape(shape), name diff --git a/python_module/megengine/_internal/version.py b/python_module/megengine/_internal/version.py deleted file mode 100644 index 57803f31..00000000 --- a/python_module/megengine/_internal/version.py +++ /dev/null @@ -1,57 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -"""version information for MegBrain package""" - -import collections - -from . import mgb as _mgb - - -class Version( - collections.namedtuple("VersionBase", ["major", "minor", "patch", "dev"]) -): - """simple sematic version object""" - - @classmethod - def __normalize(cls, v): - if isinstance(v, str): - v = v.split(".") - a, b, c = map(int, v) - return cls(a, b, c) - - def __eq__(self, rhs): - return super().__eq__(self.__normalize(rhs)) - - def __ne__(self, rhs): - return super().__ne__(self.__normalize(rhs)) - - def __lt__(self, rhs): - return super().__lt__(self.__normalize(rhs)) - - def __le__(self, rhs): - return super().__le__(self.__normalize(rhs)) - - def __gt__(self, rhs): - return super().__gt__(self.__normalize(rhs)) - - def __ge__(self, rhs): - return super().__ge__(self.__normalize(rhs)) - - def __str__(self): - rst = "{}.{}.{}".format(self.major, self.minor, self.patch) - if self.dev: - rst += "-dev{}".format(self.dev) - return rst - - -Version.__new__.__defaults__ = (0,) # dev defaults to 0 - -version_info = Version(*_mgb._get_mgb_version()) -__version__ = str(version_info) diff --git a/python_module/megengine/core/__init__.py b/python_module/megengine/core/__init__.py deleted file mode 100644 index ab452954..00000000 --- a/python_module/megengine/core/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
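The `Version` class above leans on one trick: every comparison first normalizes the right-hand side, so plain ``"x.y.z"`` strings compare directly against version tuples. A standalone sketch of that normalization (a toy mirror, not the shipped class):

    import collections

    class V(collections.namedtuple("V", ["major", "minor", "patch"])):
        @classmethod
        def _norm(cls, v):
            if isinstance(v, str):
                v = v.split(".")
            return cls(*map(int, v))

        def __ge__(self, rhs):
            # plain tuple comparison after normalizing strings to tuples
            return super().__ge__(self._norm(rhs))

    assert V(1, 2, 3) >= "1.2.0"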
-from .device import (
-    get_default_device,
-    get_device_count,
-    is_cuda_available,
-    set_default_device,
-)
-from .function import Function
-from .graph import Graph, dump
-from .serialization import load, save
-from .tensor import Tensor, TensorDict, tensor, wrap_io_tensor
-from .tensor_factory import ones, zeros
-from .tensor_nn import Buffer, Parameter
diff --git a/python_module/megengine/core/device.py b/python_module/megengine/core/device.py
deleted file mode 100644
index cb3999db..00000000
--- a/python_module/megengine/core/device.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import os
-
-import megengine._internal as mgb
-
-_default_device = os.getenv("MGE_DEFAULT_DEVICE", "xpux")
-
-
-def get_device_count(device_type: str) -> int:
-    """Gets the number of devices installed on this system.
-
-    :param device_type: device type, one of 'gpu' or 'cpu'
-    """
-
-    device_type_set = ("cpu", "gpu")
-    assert device_type in device_type_set, "device must be one of {}".format(
-        device_type_set
-    )
-    return mgb.config.get_device_count(device_type)
-
-
-def is_cuda_available() -> bool:
-    """Returns whether a cuda device is available on this system.
-    """
-    return mgb.config.get_device_count("gpu", warn=False) > 0
-
-
-def set_default_device(device: str = "xpux"):
-    r"""Sets the default computing node.
-
-    :param device: default device type. The type can be 'cpu0', 'cpu1', etc.,
-        or 'gpu0', 'gpu1', etc., to specify the particular cpu or gpu to use.
-        'cpux' and 'gpux' can also be used to specify any number of cpu or gpu devices.
-
-        The 'multithread' device type is available for inference; it implements
-        multi-threading parallelism at the operator level. For example,
-        'multithread4' will compute with 4 threads.
-
-        The default value is 'xpux' to specify any device available.
-
-        It can also be set by the environment variable `MGE_DEFAULT_DEVICE`.
-    """
-    global _default_device  # pylint: disable=global-statement
-    _default_device = device
-
-
-def get_default_device() -> str:
-    r"""Gets the default computing node.
-
-    It returns the value set by :func:`~.set_default_device`.
-    """
-    return _default_device
diff --git a/python_module/megengine/core/function.py b/python_module/megengine/core/function.py
deleted file mode 100644
index c6fb7d43..00000000
--- a/python_module/megengine/core/function.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
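A brief sketch of how the removed device helpers compose, assuming the legacy `megengine.core` package above is importable (device names follow the docstring's conventions):

    from megengine.core import (
        get_default_device,
        is_cuda_available,
        set_default_device,
    )

    # prefer the first GPU when one is present, otherwise pin to cpu0
    set_default_device("gpu0" if is_cuda_available() else "cpu0")
    assert get_default_device() in ("gpu0", "cpu0")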
-import copy -from abc import ABCMeta, abstractmethod -from typing import Iterable, Tuple, Union - -import megengine._internal as mgb - -from .tensor import Tensor - - -class _OverrideGradientCraniotome(mgb.craniotome.CraniotomeBase): - __nr_inputs__ = None - __nr_outputs__ = None - __expand_single_outputs__ = False - __allow_duplicate__ = False - - grad_func = None - - def setup(self, nr_inputs, nr_outputs, grad_func): - self.__nr_inputs__ = nr_inputs + nr_outputs - self.__nr_outputs__ = nr_outputs - self.grad_func = grad_func - - def infer_shape(self, inp_shapes): - return inp_shapes[-self.__nr_outputs__ :] - - def init_output_dtype(self, input_dtypes): - return input_dtypes[-self.__nr_outputs__ :] - - def execute(self, inputs, outputs): - for ivar, ovar in zip(inputs[-self.__nr_outputs__ :], outputs): - ovar.set_value(ivar) - - def grad(self, wrt_idx, inputs, outputs, out_grad): - # TODO: Make sure grad_values really have values in eager mode. - # Porting to the new imperative engine would solve this, but if it - # don't happen, EagerEvalManager should be changed. - grads = self.grad_func( - *(Tensor(x) if x is not None else None for x in out_grad) - ) - # pylint: disable=literal-comparison - if isinstance(grads, Tensor) or grads is None or grads is 0: - grads = (grads,) - assert ( - len(grads) == self.__nr_inputs__ - self.__nr_outputs__ - ), "Function.backward should return a tuple with len = {}, got {}".format( - self.__nr_inputs__ - self.__nr_outputs__, len(grads) - ) - # pylint: disable=literal-comparison - return ( - list(x._symvar if x is not None and x is not 0 else 0 for x in grads) - + [0] * self.__nr_outputs__ - ) - - def get_serialize_params(self): - raise NotImplementedError("Serialization of Function is not implemented") - - -class Function(metaclass=ABCMeta): - """ - Defines a block of operations with customizable differentiation. - - The computation should be defined in ``forward`` method, with gradient - computation defined in ``backward`` method. - - Each instance of ``Function`` should be used only once during forwardding. - - Examples: - - .. testcode:: - - class Sigmoid(Function): - def forward(self, x): - y = 1 / (1 + F.exp(-x)) - self.save_for_backward(y) - return y - - def backward(self, output_grads): - (y, ) = self.saved_tensors - return output_grads * y * (1-y) - - """ - - _has_saved_state = False - saved_tensors = None - - def __init__(self): - self.saved_tensors = () - - @abstractmethod - def forward(self, *inputs: Iterable[Tensor]) -> Union[Tuple[Tensor], Tensor]: - """ - Applies operations to ``inputs`` and returns results. It must be overriden by all subclasses. - Users can call :meth:`~.function.Function.save_for_backward` in this method to save tensors. - - :param input: Input tensors. - :return: A tuple of Tensor or a single Tensor. - - .. note:: - - This method should return a tuple of Tensor or a single Tensor representing the output - of the function. - """ - raise NotImplementedError - - @abstractmethod - def backward( - self, *output_grads: Iterable[Union[Tensor, None]] - ) -> Union[Tuple[Tensor], Tensor]: - """ - Compute the gradient of the forward function. It must be overriden by all subclasses. - - :param output_grads: gradients of outputs that are returned by :meth:`~.function.Function.forward` - - .. note:: - - In case when some tensors of outputs are not related to loss function, the corresponding - values in ``output_grads`` would be ``None``. - - .. 
note:: - - This method should return a tuple which containing the gradients of all inputs, in the same order - as the ``inputs`` argument of :meth:`~.function.Function.forward` . A ``Tensor`` could be returned - instead if there is only one input. If users want to stop the propagation of some gradients, - the corresponding returned values should be set ``None`` . - - """ - raise NotImplementedError - - def save_for_backward(self, *tensors: Iterable[Tensor]): - """ - Saves tensors needed for gradient computation. This method should be called only - once in :meth:`~.function.Function.forward`, additional calls will replace values saved previously. - - The saved tensors can be accessed through the ``saved_tensors`` attribute. - """ - self.saved_tensors = tensors - - def __deepcopy__(self, memo): - """ - Defines how the operator is deeply copied - """ - cls = self.__class__ - result = cls.__new__(cls) - tmp = self.saved_tensors - self.saved_tensors = None - memo[id(self)] = result - for k, v in self.__dict__.items(): - setattr(result, k, copy.deepcopy(v, memo)) - setattr(result, "saved_tensors", tmp) - self.saved_tensors = tmp - return result - - def __call__(self, *inputs): - assert ( - not self._has_saved_state - ), "A Function instance should not be called multiple times" - outputs = self.forward(*inputs) - if isinstance(outputs, Tensor): - outputs = (outputs,) - self._has_saved_state = True - sv = (x._symvar for x in inputs + outputs) - outputs = _OverrideGradientCraniotome.make( - *sv, nr_inputs=len(inputs), nr_outputs=len(outputs), grad_func=self.backward - ) - outputs = tuple(map(Tensor, outputs)) - if len(outputs) == 1: - outputs = outputs[0] - return outputs diff --git a/python_module/megengine/core/graph.py b/python_module/megengine/core/graph.py deleted file mode 100644 index 332f198d..00000000 --- a/python_module/megengine/core/graph.py +++ /dev/null @@ -1,158 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections -import threading - -import megengine._internal as mgb - -from .device import get_default_device - - -class _DefaultGraph(threading.local): - r""" - An implicit thread-local graph - """ - - def __init__(self): - super(_DefaultGraph, self).__init__() - self._default_graph = None - - def get_default(self): - r"""Returns a default Graph object for eager evaluation. - """ - if self._default_graph is None: - self._default_graph = Graph() - return self._default_graph - - -_default_graph = _DefaultGraph() - - -class Graph(mgb.CompGraph): - r""" - A computing graph that supporting context management. - - :param check_env_var: whether to check environment vars including ``MGB_COMP_GRAPH_OPT``. - :param eager_evaluation: use dynamic graph(``True``) or static graph(``False``). - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - from megengine.core import Graph - - with Graph(eager_evaluation=True): - x = tensor([1, 2]) - print(x) - - Outputs: - - .. 
testoutput:: - - Tensor([1 2], dtype=int32) - - """ - - __saved_graph = None - - def __new__( - cls, *, check_env_var: bool = True, eager_evaluation: bool = True, **kwargs - ): - kwargs.update(eager_evaluation=eager_evaluation) - self = mgb.comp_graph(extra_opts=kwargs, check_env_var=check_env_var) - self.__class__ = cls - return self - - def __init__( - self, *, check_env_var: bool = True, eager_evaluation: bool = True, **kwargs - ): - # pylint: disable=super-init-not-called - pass - - def __enter__(self): - self.__saved_graph = _default_graph._default_graph - _default_graph._default_graph = self - return self - - def __exit__(self, type, value, traceback): - _default_graph._default_graph = self.__saved_graph - del self.__saved_graph - - -def _use_default_if_none(device, comp_graph): - if device is None: - device = get_default_device() - if comp_graph is None: - comp_graph = get_default_graph() - return device, comp_graph - - -def dump(outputs, fpath, optimize_options=None, **kwargs): - r""" - Serializes this computing graph and writes it to a file. - - :type outputs: ``Tensor`` or a collection of ``Tensor`` - :param outputs: output variables that need to be retrieved when - deserializing - :type fpath: ``str`` - :param fpath: path for the output file - :type optimize_options: ``list`` - :param optimize_options: ``['f16_io_f32_comp', 'f16_io_comp', 'use_nhwcd4', 'fuse_conv_bias_nonlinearity']`` , four elements are optional, it can be an empty list, None or a list containing any of them. - - .. note:: - - ``f16_io_f32_comp`` – whether to use float16 for I/O between oprs and use float32 as internal computation precision. Note the output var would be changed to float16; - - ``f16_io_comp`` – whether to use float16 for both I/O and computation precision; - - ``use_nhwcd4`` – whether to use NHWCD4 data format. This is faster on some OpenCL devices; - - ``fuse_conv_bias_nonlinearity`` – whether to fuse conv+bias+nonlinearty into one opr. This is supported only when ``use_nhwcd4`` is set. - - """ - from .tensor import Tensor - - assert optimize_options is None or isinstance( - optimize_options, list - ), "optimize_options must be a list" - - if isinstance(outputs, Tensor): - outputs = [outputs] - else: - assert isinstance(outputs, collections.Iterable), "{} not iterable".format( - outputs - ) - outputs = list(outputs) - - for output in outputs: - assert isinstance(output, Tensor), "All outputs must be Tensors." - - outputs = [o._symvar for o in outputs] - - if optimize_options: - opt_dict = dict.fromkeys(optimize_options, True) - mgb.optimize_for_inference(outputs, **opt_dict) - mgb.serialize_comp_graph_to_file(fpath, outputs, **kwargs) - - -def set_default_graph(default_graph): - r""" - Sets a global default Graph object. - """ - global _default_graph # pylint: disable=global-statement - _default_graph._default_graph = default_graph - - -def get_default_graph(): - r""" - Returns a default Graph object, most probably for eager evaluation. - """ - return _default_graph.get_default() diff --git a/python_module/megengine/core/serialization.py b/python_module/megengine/core/serialization.py deleted file mode 100644 index 8c18a534..00000000 --- a/python_module/megengine/core/serialization.py +++ /dev/null @@ -1,128 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
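Since `dump` above carries no usage example of its own, here is a hedged sketch: a static graph is assumed (serialization needs a constructed graph rather than eager results), the file name is illustrative, and the option comes from the docstring's list:

    import numpy as np
    from megengine import tensor
    from megengine.core import Graph, dump

    with Graph(eager_evaluation=False):
        x = tensor(np.ones((4, 4), dtype=np.float32))
        y = x * 2
        dump(y, "model.mge", optimize_options=["f16_io_comp"])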
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import pickle - -import megengine._internal as mgb - -from ..utils.max_recursion_limit import max_recursion_limit -from .device import get_default_device - - -def save(obj, f, pickle_module=pickle, pickle_protocol=pickle.HIGHEST_PROTOCOL): - r"""Save an object to disk file. - - :type obj: object - :param obj: object to save. Only ``module`` or ``state_dict`` are allowed. - :type f: text file object - :param f: a string of file name or a text file object to which ``obj`` is saved to. - :type pickle_module: - :param pickle_module: Default: ``pickle``. - :type pickle_protocol: - :param pickle_protocol: Default: ``pickle.HIGHEST_PROTOCOL``. - - """ - if isinstance(f, str): - with open(f, "wb") as fout: - save( - obj, fout, pickle_module=pickle_module, pickle_protocol=pickle_protocol - ) - return - - with max_recursion_limit(): - assert hasattr(f, "write"), "{} does not support write".format(f) - pickle_module.dump(obj, f, pickle_protocol) - - -class dmap: - def __init__(self, map_location): - self.map_location = map_location - - def __enter__(self): - mgb.add_device_map(self.map_location) - return self - - def __exit__(self, type, value, traceback): - mgb.del_device_map() - - -def _get_callable_map_location(map_location): - if map_location is None: - - def callable_map_location(state): - return str(get_default_device()) - - elif isinstance(map_location, str): - - def callable_map_location(state): - return map_location - - elif isinstance(map_location, dict): - locator_map = {} - for key, value in map_location.items(): - locator_key = mgb.config.parse_locator(key)[:2] - locator_map[locator_key] = value - - def callable_map_location(state): - orig = mgb.config.parse_locator(state)[:2] - if orig in locator_map.keys(): - state = locator_map[orig] - return state - - else: - assert callable(map_location), "map_location should be str, dict or function" - callable_map_location = map_location - return callable_map_location - - -def load(f, map_location=None, pickle_module=pickle): - r"""Load an object saved with save() from a file. - - :type f: text file object - :param f: a string of file name or a text file object from which to load. - :type map_location: str, dict or a function specifying the map rules - :param map_location: Default: ``None``. - - .. note:: - - map_location will change the logical locator when loading models, - avoiding tensors be loading on non-existent device. If you want to - add the mapping relationship between logical locator and physical - locator in runtime, please call :func:`mge.set_device_map()` - - :type pickle_module: - :param pickle_module: Default: ``pickle``. - - .. note:: - - If you will call :func:`mge.set_default_device()`, please do it - before :func:`mge.load()`. - - Examples: - - .. testcode: - - import megengine as mge - mge.load('model.mge') - # Load all tensors based on logical location. 
- mge.load('model.mge', map_location='gpu0') - # Load all tensors onto the device: GPU0 - mge.load('model.mge', map_location={'gpu0':'cpu0'}) - # Load all tensors based on logical location, but 'GPU0' will be renamed to 'CPU0' - mge.load('model.mge', map_location=lambda dev: 'cpu0') - # Load all tensors onto the device" CPU0 - - """ - if isinstance(f, str): - with open(f, "rb") as fin: - return load(fin, map_location=map_location, pickle_module=pickle_module) - - map_location = _get_callable_map_location(map_location) # callable map_location - - with dmap(map_location): - return pickle_module.load(f) diff --git a/python_module/megengine/core/tensor.py b/python_module/megengine/core/tensor.py deleted file mode 100644 index f96f90f2..00000000 --- a/python_module/megengine/core/tensor.py +++ /dev/null @@ -1,771 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections -import copy -import functools -import itertools -import weakref -from typing import Callable, Tuple, Union - -import numpy as np - -import megengine._internal as mgb - -from .graph import _use_default_if_none, get_default_graph - - -def wrap_io_tensor(func): - r"""A wrapper to make ``func`` compatible with functions in ``_internal.opr``. - """ - - @functools.wraps(func) - def wrapper(*args, **kwargs): - comp_graph = None - for i in itertools.chain(args, kwargs.values()): - if isinstance(i, Tensor) and i._comp_graph: - comp_graph = i._comp_graph - break - else: - - comp_graph = get_default_graph() - new_args = ( - arg._attach(comp_graph) if isinstance(arg, Tensor) else arg for arg in args - ) - new_kwargs = { - k: v._attach(comp_graph) if isinstance(v, Tensor) else v - for k, v in kwargs.items() - } - ret = func(*new_args, **new_kwargs) - if isinstance(ret, mgb.SymbolVar): - ret = Tensor(ret) - elif isinstance(ret, list): - ret = [Tensor(t) if isinstance(t, mgb.SymbolVar) else t for t in ret] - elif isinstance(ret, tuple): - ret = tuple(Tensor(t) if isinstance(t, mgb.SymbolVar) else t for t in ret) - return ret - - return wrapper - - -def _wrap_symbolvar_binary_op(f): - @functools.wraps(f) - def wrapped(self, other): - comp_graph = ( - isinstance(other, Tensor) - and other._comp_graph - or self._comp_graph - or get_default_graph() - ) - if isinstance(other, Tensor): - other = other._attach(comp_graph) - return Tensor(f(self._attach(comp_graph), other)) - - return wrapped - - -def _wrap_slice(inp: slice): - r""" - A wrapper to handle Tensor values in ``inp`` slice. - """ - start = inp.start._symvar if isinstance(inp.start, Tensor) else inp.start - stop = inp.stop._symvar if isinstance(inp.stop, Tensor) else inp.stop - step = inp.step._symvar if isinstance(inp.step, Tensor) else inp.step - return slice(start, stop, step) - - -def _wrap_idx(idx: Tuple[Union[int, "Tensor"]]): - r""" - A wrapper to handle Tensor values in ``idx``. - """ - if not isinstance(idx, tuple): - idx = (idx,) - - idx = tuple(i._symvar if isinstance(i, Tensor) else i for i in idx) - idx = tuple(_wrap_slice(i) if isinstance(i, slice) else i for i in idx) - return idx - - -class _MGBIndexWrapper: - r""" - A wrapper class to handle ``__getitem__`` for index containing Tensor values. - - :param dest: a destination Tensor to do indexing on. 
- :param mgb_index: an ``_internal`` helper function indicating how to index. - :param val: a optional Tensor parameter used for ``mgb_index``. - """ - - def __init__(self, dest: "Tensor", mgb_index: Callable, val=None): - self.dest = dest - self.val = val - self.mgb_index = mgb_index - - def __getitem__(self, idx): - if self.val is None: - return wrap_io_tensor(self.mgb_index(self.dest._symvar).__getitem__)( - _wrap_idx(idx) - ) - else: - return wrap_io_tensor( - self.mgb_index(self.dest._symvar, self.val._symvar).__getitem__ - )(_wrap_idx(idx)) - - -class _Guard: - r""" - A wrapper class with custom ``__del__`` method calling ``deleter``. - - :param deleter: a function to be called in ``__del__``. - """ - - def __init__(self, deleter: Callable): - self.deleter = deleter - - def __del__(self): - self.deleter() - - -class Tensor: - r"""The main data container in MegEngine. - Use :func:`~.tensor` to create a Tensor with existed data. - """ - requires_grad = False - grad = None - - def __init__(self, val=None, *, requires_grad=None): - self._reset(val, requires_grad=requires_grad) - self.q_dict = {"mode": None, "scale": None, "zero_point": None} - - def _reset(self, val=None, *, requires_grad=None): - self.__sym_override = None - if val is None: - self.__val = None - self.__sym = None - elif isinstance(val, mgb.SharedND): - self.__val = val - self.__sym = None - elif isinstance(val, mgb.SymbolVar): - self.__val = None - self.__sym = val - else: - raise TypeError("must be initialized with SymbolVar or SharedND") - self.requires_grad = requires_grad - - def _as_tensor(self, obj): - r"""Convert the data into a ``Tensor``. If the data is already a Tensor - with the same dtype and device, no copy will be performed. Otherwise a - new Tensor will be returned with computational graph retained. - - """ - if isinstance(obj, Tensor): - return obj - if isinstance(obj, mgb.SymbolVar): - return Tensor(obj) - if isinstance(obj, mgb.SharedScalar): - return Tensor(obj._as_sym_var(self._comp_graph, self._comp_node)) - return tensor(data=obj, device=self.device) - - def numpy(self): - r"""Return the tensor value in numpy.ndarray format. 
- """ - if self.__val is not None: - assert self.__sym is None - return self.__val.get_value() - if self.__sym is None: - raise ValueError("uninitialized") - if self.__sym.eager_val is not None: - return self.__sym.eager_val.get_value() - return self.__sym.inferred_value - - def item(self): - r"""If tensor only has only one value, return it.""" - return self.numpy().item() - - def _attach(self, comp_graph, *, volatile=True): - sym = self.__sym_override or self.__sym - if sym: - if sym.owner_graph != comp_graph: - raise RuntimeError("internal error") - return sym - if self.__val: - return self.__val.symvar(comp_graph, volatile=volatile) - else: - raise ValueError("uninitialized") - - @property - def _symvar(self): - if self.__sym_override: - return self.__sym_override - if self.__sym: - assert not self.__val - return self.__sym - if not self.__val: - raise ValueError("uninitialized") - - return self._attach(get_default_graph()) - - def __mgb_symvar__(self, comp_graph=None, **_): - if self.__sym_override: - return self.__sym_override - if self.__val and comp_graph: - return self._attach(comp_graph) - return self._symvar # read by mgb.opr - - def _override_symvar_during_trace(self, trace, symvar): - assert self.__val and not self.__sym - assert trace is type(trace)._active_instance - deleters = trace._user_cache.setdefault(Tensor, set()) - self_ref = weakref.ref(self) - - def restore(): - self = self_ref() - if self is not None: - self.__sym_override = None - - deleters.add(_Guard(restore)) - self.__sym_override = symvar - - @property - def dtype(self): - r"""Return the data type of the tensor. - """ - if self.__val is not None: - return self.__val.dtype - return self._symvar.dtype - - @dtype.setter - def dtype(self, dtype: str = None): - r"""Set the data type of the tensor. - """ - if self.__val is not None: - self.__val = mgb.make_shared(self.device, value=self.astype(dtype).numpy()) - elif self.__sym_override is not None: - self.__sym_override = self.__sym_override.astype(dtype) - elif self.__sym is not None: - self.__sym = self.__sym.astype(dtype) - - @property - def name(self): - r"""Get the tensor name, does not support Parameter and Buffer. - """ - return self._symvar.name - - @name.setter - def name(self, name: str = None): - r"""Set the tensor name, does not support Parameter and Buffer. - """ - if self.__val is not None: - raise ValueError("name setting is not available for Parameter or Buffer.") - if self.__sym_override is not None: - self.__sym_override = self.__sym_override.rename(name) - if self.__sym is not None: - assert not self.__val - self.__sym = self.__sym.rename(name) - - @property - def _comp_node(self): - if self.__val is not None: - return self.__val.comp_node - return self._symvar.comp_node - - device = _comp_node - - @property - def _comp_graph(self): - if self.__sym is not None: - return self.__sym.owner_graph - return None - - @property - def shape(self): - r"""Return an int tuple that is the shape/layout of the tensor. - Could be invalid in static graph mode. - """ - from ..jit import trace - - if trace._active_instance: # pylint: disable=protected-access - # NOTE: this is an hack - shape = mgb.opr.get_var_shape(self._symvar) - return tuple(Tensor(shape[i]) for i in range(self.ndim)) - return self._symvar.imm_shape - - def set_value(self, value, *, sync=True, inplace=False, share=False): - r"""Set value to the tensor. 
- """ - if not self.__val: - raise ValueError("not detached") - if isinstance(value, Tensor): - value = value.__val or value.__sym.eager_val - self.__val.set_value(value, sync=sync, inplace=inplace, share=share) - - def fill(self, value): - r"""Fills the tensor with the specified value. - """ - self.set_value(np.full(self.shape, value, dtype=self.dtype)) - - def reset_zero(self): - r"""Reset the tensor and fills with zeros. - """ - if not self.__val: - raise ValueError("not detached") - self.__val.reset_zero() - - def to(self, device): - r"""Performs Tensor device conversion, returns Tensor with the specified device. - """ - return wrap_io_tensor(mgb.opr.copy)(self, comp_node=device) - - # https://docs.python.org/3/reference/datamodel.html#object.__hash__ - # > If a class does not define an __eq__() method it should not define a - # > __hash__() operation either - __hash__ = None # type: ignore[assignment] - - def __eq__(self, rhs): - rhs = self._as_tensor(rhs) - return Tensor(self._symvar._binary_opr("EQ", rhs._symvar)) - - def __ne__(self, rhs): - return 1 - self.__eq__(rhs) - - def __len__(self): - if self._symvar.eager_val is not None: - return self._symvar.eager_val.shape[0] - raise TypeError( - "__len__ and __iter__ is not available for tensors on non eager graph." - ) - - __add__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__add__) - __radd__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__radd__) - __sub__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__sub__) - __rsub__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__rsub__) - __mul__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__mul__) - __rmul__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__rmul__) - __matmul__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__matmul__) - __rmatmul__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__rmatmul__) - __lshift__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__lshift__) - __rshift__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__rshift__) - __truediv__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__truediv__) - __rtruediv__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__rtruediv__) - __floordiv__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__floordiv__) - __rfloordiv__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__rfloordiv__) - __mod__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__mod__) - __rmod__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__rmod__) - __pow__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__pow__) - __rpow__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__rpow__) - __lt__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__lt__) - __gt__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__gt__) - __le__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__le__) - __ge__ = _wrap_symbolvar_binary_op(mgb.SymbolVar.__ge__) - __neg__ = wrap_io_tensor(mgb.SymbolVar.__neg__) - sum = wrap_io_tensor(mgb.SymbolVar.sum) - """ - Sum up the given tensors. - """ - max = wrap_io_tensor(mgb.SymbolVar.max) - """ - Return the maximum value of given tensor. - """ - min = wrap_io_tensor(mgb.SymbolVar.min) - """ - Return the minimum value of given tensor. - """ - prod = wrap_io_tensor(mgb.SymbolVar.prod) - """ - Return the product value of the given tensor. - """ - mean = wrap_io_tensor(mgb.SymbolVar.mean) - """ - Return the mean value of the given tensor. - """ - dimshuffle = wrap_io_tensor(mgb.SymbolVar.dimshuffle) - """ - See more details in :func:`~.functional.tensor.dimshuffle`. - """ - astype = wrap_io_tensor(mgb.SymbolVar.astype) - """ - Cast the tensor to a specified type. 
- """ - - def reshape(self, *target_shape): - r"""Return a tensor which has given target shape - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - - inp = tensor(np.arange(1, 17, dtype=np.int32).reshape(4,4)) - out = tensor(np.arange(100, 116, dtype=np.int32).reshape(1,16)) - out = out.reshape(inp.shape) - print(out.numpy()) - - .. testoutput:: - - [[100 101 102 103] - [104 105 106 107] - [108 109 110 111] - [112 113 114 115]] - """ - - if isinstance(target_shape[0], tuple): - if len(target_shape) > 1: - raise ValueError("Only single tuple is accepted in reshape") - target_shape = target_shape[0] - target_shape = (t._symvar if isinstance(t, Tensor) else t for t in target_shape) - return Tensor(mgb.SymbolVar.reshape(self._symvar, *target_shape)) - - def broadcast(self, *target_shape): - r"""Return a tesnor broadcasted by current tensor to given target shape - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - - data = tensor(np.arange(100, 104, dtype=np.int32).reshape(1,4)) - data = data.broadcast((4,4)) - print(data.numpy()) - - .. testoutput:: - - [[100 101 102 103] - [100 101 102 103] - [100 101 102 103] - [100 101 102 103]] - """ - - if isinstance(target_shape[0], tuple): - if len(target_shape) > 1: - raise ValueError("Only single tuple is accepted in broadcast") - target_shape = target_shape[0] - target_shape = (t._symvar if isinstance(t, Tensor) else t for t in target_shape) - return Tensor(mgb.SymbolVar.broadcast(self._symvar, *target_shape)) - - # Prefer operators on Tensor instead of convert to numpy - __array_priority__ = 1000 - - # mgb indexing family - def __getitem__(self, idx): - return wrap_io_tensor(self._symvar.__getitem__)(_wrap_idx(idx)) - - def set_subtensor(self, val: "Tensor") -> _MGBIndexWrapper: - r""" - Return a object which supports using ``__getitem__`` to set subtensor. - - ``c = a.set_subtensor(b)[idx]`` is equivalent to ``c = a.copy()`` and ``c[idx] = b``. - """ - return _MGBIndexWrapper(self, mgb.opr.set_subtensor, val) - - def incr_subtensor(self, val: "Tensor") -> _MGBIndexWrapper: - r""" - Return a object which supports using ``__getitem__`` to increase subtensor. - - ``c = a.incr_subtensor(b)[idx]`` is equivalent to ``c = a.copy()`` and ``c[idx] += b``. - """ - return _MGBIndexWrapper(self, mgb.opr.incr_subtensor, val) - - @property - def ai(self) -> _MGBIndexWrapper: - r""" - Return a object which supports complex index method to get subtensor. - - Examples: - - .. testcode:: - - from megengine import tensor - a = tensor(np.arange(16, dtype=np.float32).reshape((4, 4))) - print(a.ai[:, [2, 3]]) - - Outputs: - - .. testoutput:: - - Tensor([[ 2. 3.] - [ 6. 7.] - [10. 11.] - [14. 15.]]) - """ - return _MGBIndexWrapper(self, mgb.opr.advanced_indexing) - - def set_ai(self, val: "Tensor") -> _MGBIndexWrapper: - r""" - Equal to :meth:`~.Tensor.set_subtensor` which supports advanced indexing. - """ - return _MGBIndexWrapper(self, mgb.opr.set_advanced_indexing, val) - - def incr_ai(self, val: "Tensor") -> _MGBIndexWrapper: - r""" - Equal to :meth:`~.Tensor.incr_subtensor` which supports advanced indexing. - """ - return _MGBIndexWrapper(self, mgb.opr.incr_advanced_indexing, val) - - @property - def mi(self) -> _MGBIndexWrapper: - r""" - Return a object which supports getting subtensor by - the coordinates which is Cartesian product of given index. - - Examples: - - .. 
testcode:: - - import numpy as np - from megengine import tensor - a = tensor(np.arange(16, dtype=np.float32).reshape((4, 4))) - print(a.mi[[1, 2], [2, 3]]) - # is equal to elements on [1, 2] * [2, 3] = [[(1,2), (1, 3)], [(2, 2), (2, 3)]] - # a[1,2] = 6, a[1,3] = 7, a[2,2] = 10, a[2,3] = 11 - - Outputs: - - .. testoutput:: - - Tensor([[ 6. 7.] - [10. 11.]]) - """ - return _MGBIndexWrapper(self, mgb.opr.mesh_indexing) - - def set_mi(self, val: "Tensor") -> _MGBIndexWrapper: - r""" - Equal to :meth:`~.Tensor.set_subtensor` which uses mesh indexing. - """ - return _MGBIndexWrapper(self, mgb.opr.set_mesh_indexing, val) - - def incr_mi(self, val: "Tensor") -> _MGBIndexWrapper: - r""" - Equal to :meth:`~.Tensor.incr_subtensor` which uses mesh indexing. - """ - return _MGBIndexWrapper(self, mgb.opr.incr_mesh_indexing, val) - - @property - def batched_mi(self) -> _MGBIndexWrapper: - r""" - Return an object that supports getting a subtensor by - batched mesh indexing. - - For Tensor ``a`` and index ``idx``, each value of ``idx`` needs to be a 2-dim matrix or a slice. - The Cartesian product ``... * idx[k-1][i] * idx[k][i] * idx[k+1][i] * ...`` will be a subtensor from ``a[i]``. - Each matrix ``idx[k]`` should have ``batched_dim`` rows, as indicated by ``idx[0]``. - For a slice value, the same slice is applied for each ``batched_dim``. For more details see the example below. - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - a = tensor(np.arange(144, dtype=np.float32).reshape((3, 3, 4, 4))) - - print(a.batched_mi[:2, [[0],[1]],[[0,1],[2,3]],[[0],[1]]]) - # is equal to elements from a[0] with ``[0] * [0,1] * [0] = [[[(0,0,0)], [(0,1,0)]]]``(shape is [1,2,1]) - # and from a[1] with ``[1] * [2,3] * [1] = [[[(1,2,1)], [(1,3,1)]]]``(shape is also [1,2,1]) - # a[0,0,0,0] = 0, a[0,0,1,0] = 4, a[1,1,2,1] = 73, a[1,1,3,1] = 77 - - print(a.batched_mi[:2, [[0],[1]], :2, :1]) - # is equal to ``a.batched_mi[:2, [[0],[1]], [[0,1],[0,1]],[[0],[0]]]`` - - Outputs: - - .. testoutput:: - - Tensor([[[[ 0.] - [ 4.]]] - [[[73.] - [77.]]]]) - Tensor([[[[ 0.] - [ 4.]]] - [[[64.] - [68.]]]]) - """ - return _MGBIndexWrapper(self, mgb.opr.batched_mesh_indexing) - - def batched_set_mi(self, val: "Tensor") -> _MGBIndexWrapper: - r""" - Equal to :meth:`~.Tensor.set_subtensor` which uses batched mesh indexing. - """ - return _MGBIndexWrapper(self, mgb.opr.batched_set_mesh_indexing, val) - - def batched_incr_mi(self, val: "Tensor") -> _MGBIndexWrapper: - r""" - Equal to :meth:`~.Tensor.incr_subtensor` which uses batched mesh indexing. - """ - return _MGBIndexWrapper(self, mgb.opr.batched_incr_mesh_indexing, val) - - def __array__(self, dtype=None): - if dtype is None: - return self.numpy() - else: - return self.numpy().astype(dtype, copy=False) - - def __int__(self): - return int(self.item()) - - def __index__(self): - return int(self.item()) - - def __round__(self, ndigits=0): - if ndigits != 0: - raise ValueError("ndigits must be 0 for Tensor.round") - return Tensor(mgb.opr.elemwise([self._symvar], mode="ROUND")) - - round = __round__ - - def sqrt(self): - r"""Return a tensor whose elements are the square roots of the - original values. - - """ - return Tensor(mgb.opr.sqrt(self._symvar)) - - def shapeof(self, axis=None): - r"""Return a Tensor that represents the shape of the tensor. - """ - return Tensor(mgb.opr.get_var_shape(self._symvar, axis=axis)) -
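A note for readers skimming this removed API: the update helpers above are functional, never in-place. A minimal usage sketch of the pattern, assuming the legacy `megengine.tensor` factory that this module provided (the values are illustrative, not part of the current API):

import numpy as np
from megengine import tensor  # legacy python_module API shown in this diff

a = tensor(np.arange(16, dtype=np.float32).reshape((4, 4)))
b = tensor(np.ones((4, 2), dtype=np.float32))

c = a.set_subtensor(b)[:, 0:2]   # like: c = a.copy(); c[:, 0:2] = b
d = a.incr_subtensor(b)[:, 0:2]  # like: d = a.copy(); d[:, 0:2] += b
e = a.ai[:, [2, 3]]              # advanced indexing, like a[:, [2, 3]]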
- """ - return len(self._symvar.imm_shape) - - def __repr__(self): - piece = "Tensor(" - with np.printoptions(precision=4, suppress=True): - piece += "{}".format(str(self.numpy())) - if self.dtype != np.float32: - piece += ", dtype={}".format(np.dtype(self.dtype).name) - if self._comp_node.locator_logical != ("XPU", -1, 0): - piece += ", device={}".format(self.device) - piece += ")" - return piece - - def __bool__(self): - raise RuntimeError( - "Tensor object should not be converted to bool or used in a if statement. Use .numpy(), int() or float() if you want to use its value in if statement, be aware that this may lead to incorrect result in non-eager mode." - ) - - def __getstate__(self): - r""" __getstate__ will be called for pickle serialization or deep copy - """ - - assert (self.__val is not None) and ( - self.__sym is None - ), "Only SharedND initialized Tensor can be serialized or deep copied" - metadata = {"requires_grad": self.requires_grad} - state = { - "data": self.numpy(), - "device": self.device, - "dtype": self.dtype, - "metadata": metadata, - } - return state - - def __setstate__(self, state): - data = state.pop("data") - device = state.pop("device") - dtype = state.pop("dtype") - metadata = state.pop("metadata", {}) - requires_grad = metadata.pop("requires_grad", None) - snd = mgb.make_shared(device, value=data, dtype=dtype) - self._reset(snd, requires_grad=requires_grad) - - def __deepcopy__(self, memo): - """ - The default deepcopy will ignore other attributes except those defined at - __getstate__ and __setstate__ method. - So we need to add __deepcopy__ method to deepcopy correct attributes. - """ - assert (self.__val is not None) and ( - self.__sym is None - ), "Only SharedND initialized Tensor can be serialized or deep copied" - cls = self.__class__ - result = cls.__new__(cls) - memo[id(self)] = result - for k, v in self.__dict__.items(): - setattr(result, k, copy.deepcopy(v, memo)) - return result - - -def tensor( - data: Union[list, np.ndarray] = None, - *, - dtype: str = None, - device: mgb.CompNode = None, - requires_grad: bool = None -): - r"""A helper function to create a :class:`~.Tensor` using existing data. - - :param data: an existing data array, must be Python list, NumPy array or None. - :param dtype: target Tensor data type, one of ``("uint8", "int8", "int16", "int32", "float32", "float16")``. - :param device: target device for Tensor storing. - :param requires_grad: whether its gradiant will be calculated during :meth:`~.Optimizer.backward` - """ - supported_dtypes = ("uint8", "int8", "int16", "int32", "float32", "float16") - if isinstance(data, Tensor): - raise NotImplementedError - if dtype is not None and np.dtype(dtype).name not in supported_dtypes: - raise TypeError("unsupported dtype {}".format(dtype)) - if data is not None: - if not isinstance(data, np.ndarray): - data = np.array(data, dtype=dtype) - # In order to accept tensor([1]), - # Automaticlly convert to 32-bit number instead of numpy's default 64-bit when input data is not nparray. - dtype = mgb.to_mgb_supported_dtype(data.dtype) - if dtype is None: - if data.dtype.name not in supported_dtypes: - raise TypeError("unsupported dtype {}".format(data.dtype)) - - device, _ = _use_default_if_none(device, None) - shared_nd = mgb.make_shared(device, value=data, dtype=dtype) - return Tensor(shared_nd, requires_grad=requires_grad) - - -class TensorDict(collections.MutableMapping): - r""" - A helper class to maintain dict with Tensor key. 
- """ - - def __init__(self, *args, **kwargs): - self.data = {} - for i in args: - self.update(i) - self.update(**kwargs) - - class keyfn: - def __new__(cls, x: Tensor): - if not isinstance(x, Tensor): - return x - return super().__new__(cls) - - def __init__(self, x: Tensor): - self._data = x # do not save id directly to make pickle work - - def __hash__(self): - return id(self._data) - - def __eq__(self, other): - return isinstance(other, type(self)) and id(self._data) == id(other._data) - - def __getitem__(self, key): - _, v = self.data[self.keyfn(key)] - return v - - def __setitem__(self, key, value): - self.data[self.keyfn(key)] = key, value - - def __delitem__(self, key): - del self.data[self.keyfn(key)] - - def __iter__(self): - for _, (k, _) in self.data.items(): - yield k - - def __len__(self): - return len(self.data) diff --git a/python_module/megengine/core/tensor_factory.py b/python_module/megengine/core/tensor_factory.py deleted file mode 100644 index 4e83bdba..00000000 --- a/python_module/megengine/core/tensor_factory.py +++ /dev/null @@ -1,109 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Iterable, Optional, Union - -import megengine._internal as mgb - -from .graph import _use_default_if_none -from .tensor import Tensor - -__all__ = ["zeros", "ones"] - - -def scalar( - value, - dtype: type = None, - device: Optional[mgb.CompNode] = None, - comp_graph: Optional[mgb.CompGraph] = None, -) -> Tensor: - """ - convert ``value`` to the type of :class:`~.Tensor`. - """ - device, comp_graph = _use_default_if_none(device, comp_graph) - return Tensor(mgb.make_immutable(device, comp_graph, value, dtype=dtype, name=None)) - - -def zeros( - shape: Union[int, Iterable[int], Tensor], - dtype: type = None, - device: Optional[mgb.CompNode] = None, - comp_graph: Optional[mgb.CompGraph] = None, -) -> Tensor: - """ - Create a tensor filled with 0. - - :param shape: tensor shape - :param dtype: data type, Default: "int32" - :param device: Compute node of the matrix, Default: None - :param comp_graph: Compute graph of the matrix, Default: None - :return: tensor of zeros - - Examples: - - .. testcode:: - - import megengine as mge - - t = mge.zeros((2, 2), dtype="int32") - print(t.numpy()) - - Outputs: - - .. testoutput:: - - [[0 0] - [0 0]] - - """ - device, comp_graph = _use_default_if_none(device, comp_graph) - if isinstance(shape, (int, Tensor)): - shape = (shape,) - tensor = scalar(0, dtype=dtype, device=device, comp_graph=comp_graph) - tensor = tensor.broadcast(*shape) - return tensor - - -def ones( - shape: Union[int, Iterable[int], Tensor], - dtype: type = None, - device: Optional[mgb.CompNode] = None, - comp_graph: Optional[mgb.CompGraph] = None, -) -> Tensor: - """ - Create a tensor filled with 1. - - :param shape: tensor shape - :param dtype: data type, Default: "int32" - :param device: Compute node of the matrix, Default: None - :param comp_graph: Compute graph of the matrix, Default: None - :return: tensor of ones - - Examples: - - .. testcode:: - - import megengine as mge - - t = mge.ones((2, 2), dtype="float32") - print(t.numpy()) - - Outputs: - - .. testoutput:: - - [[1. 1.] - [1. 
1.]] - - """ - device, comp_graph = _use_default_if_none(device, comp_graph) - if isinstance(shape, (int, Tensor)): - shape = (shape,) - tensor = scalar(1, dtype=dtype, device=device, comp_graph=comp_graph) - tensor = tensor.broadcast(*shape) - return tensor diff --git a/python_module/megengine/core/tensor_nn.py b/python_module/megengine/core/tensor_nn.py deleted file mode 100644 index e2bbc927..00000000 --- a/python_module/megengine/core/tensor_nn.py +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .tensor import Tensor, tensor - - -class Buffer(Tensor): - r"""A kind of Tensor with ``requires_grad=False``. - """ - - def __init__(self, value, *, dtype=None, device=None, requires_grad=False): - # pylint: disable=super-init-not-called - t = tensor(value, dtype=dtype, device=device, requires_grad=requires_grad) - self.__dict__.update(t.__dict__) - - -class Parameter(Tensor): - r"""A kind of Tensor that is to be considered a module parameter. - """ - - def __init__(self, value, *, dtype=None, device=None, requires_grad=True): - # pylint: disable=super-init-not-called - if isinstance(value, Tensor): - t = value - else: - t = tensor(value, dtype=dtype, device=device, requires_grad=requires_grad) - self.__dict__.update(t.__dict__) - - # broadcast and allreduce will not be performed in optimizer if replica_mode is False - self.replica_mode = True - - @property - def shape(self): - r"""Return shape of parameter. - """ - if self._Tensor__val is not None: - return self._Tensor__val.shape - elif self._Tensor__sym is not None: - return self._Tensor__sym.imm_shape - return None diff --git a/python_module/megengine/data/__init__.py b/python_module/megengine/data/__init__.py deleted file mode 100644 index 3b1e0d55..00000000 --- a/python_module/megengine/data/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .collator import Collator -from .dataloader import DataLoader -from .sampler import ( - Infinite, - RandomSampler, - ReplacementSampler, - Sampler, - SequentialSampler, -) diff --git a/python_module/megengine/data/_queue.py b/python_module/megengine/data/_queue.py deleted file mode 100644 index 8e359ae0..00000000 --- a/python_module/megengine/data/_queue.py +++ /dev/null @@ -1,144 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
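Some orientation before the module body below: `PlasmaShmQueue` keeps payloads in pyarrow's plasma shared-memory store and sends only small ObjectIDs through a native `multiprocessing.Queue`, which is what removes the pickle and copy overhead. A stripped-down sketch of that round-trip, assuming an older pyarrow that still ships plasma and a `plasma-store-server` already listening on the (illustrative) socket path:

import pyarrow.plasma as plasma
from multiprocessing import Queue

SOCKET = "/tmp/mge_plasma_demo"  # illustrative; the real module randomizes this path
header_queue = Queue(maxsize=2)  # carries only ObjectIDs, never the payload
client = plasma.connect(SOCKET)  # the real code connects lazily in each process

def put(data):
    object_id = client.put(data)  # payload written to shared memory once
    header_queue.put(object_id)   # only the small ObjectID gets pickled

def get():
    object_id = header_queue.get()
    data = client.get(object_id)  # read back from the plasma store
    client.delete([object_id])    # free shared memory eagerly, as the queue does
    return data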
-import binascii -import os -import queue -import subprocess -from multiprocessing import Queue - -import pyarrow -import pyarrow.plasma as plasma - -MGE_PLASMA_MEMORY = int(os.environ.get("MGE_PLASMA_MEMORY", 4000000000)) # 4GB - -# Each process only needs to start one plasma store, so we set it as a global variable. -# TODO: how to share between different processes? -MGE_PLASMA_STORE_MANAGER = None - - -def _clear_plasma_store(): - # `_PlasmaStoreManager.__del__` will not be called automatically in subprocesses, - # so this function should be called explicitly - global MGE_PLASMA_STORE_MANAGER - if MGE_PLASMA_STORE_MANAGER is not None and MGE_PLASMA_STORE_MANAGER.refcount == 0: - del MGE_PLASMA_STORE_MANAGER - MGE_PLASMA_STORE_MANAGER = None - - -class _PlasmaStoreManager: - __initialized = False - - def __init__(self): - self.socket_name = "/tmp/mge_plasma_{}".format( - binascii.hexlify(os.urandom(8)).decode() - ) - debug_flag = bool(os.environ.get("MGE_DATALOADER_PLASMA_DEBUG", 0)) - # NOTE: this is a hack. Directly using `plasma_store` may make it difficult - # for the subprocess to handle exceptions raised in `plasma-store-server`, - # since `plasma_store` is just a wrapper of `plasma-store-server`, which uses - # `os.execv` to call the executable `plasma-store-server`. - cmd_path = os.path.join(pyarrow.__path__[0], "plasma-store-server") - self.plasma_store = subprocess.Popen( - [cmd_path, "-s", self.socket_name, "-m", str(MGE_PLASMA_MEMORY),], - stdout=None if debug_flag else subprocess.DEVNULL, - stderr=None if debug_flag else subprocess.DEVNULL, - ) - self.__initialized = True - self.refcount = 1 - - def __del__(self): - if self.__initialized and self.plasma_store.returncode is None: - self.plasma_store.kill() - - -class PlasmaShmQueue: - def __init__(self, maxsize: int = 0): - r"""Use pyarrow's in-memory plasma store to implement a shared-memory queue. - - Compared to the native `multiprocessing.Queue`, `PlasmaShmQueue` avoids - pickle/unpickle and communication overhead, leading to better performance in - multi-process applications. - - :type maxsize: int - :param maxsize: maximum size of the queue; ``0`` means no limit. (default: ``0``) - """ - - # Lazily start the plasma store manager - global MGE_PLASMA_STORE_MANAGER - if MGE_PLASMA_STORE_MANAGER is None: - try: - MGE_PLASMA_STORE_MANAGER = _PlasmaStoreManager() - except Exception as e: - err_info = ( - "Please make sure pyarrow is installed correctly!\n" - "You can try reinstalling pyarrow and see if you can run " - "`plasma_store -s /tmp/mge_plasma_xxx -m 1000` normally." - ) - raise RuntimeError( - "Exception happened while starting plasma_store: {}\n" - "Tips: {}".format(str(e), err_info) - ) - else: - MGE_PLASMA_STORE_MANAGER.refcount += 1 - - self.socket_name = MGE_PLASMA_STORE_MANAGER.socket_name - - # TODO: how to catch exceptions raised in `plasma.connect`?
- self.client = None - - # Used to store the headers (ObjectIDs) for the data. - self.queue = Queue(maxsize) # type: Queue - - def put(self, data, block=True, timeout=None): - if self.client is None: - self.client = plasma.connect(self.socket_name) - try: - object_id = self.client.put(data) - except plasma.PlasmaStoreFull: - raise RuntimeError("plasma store out of memory!") - try: - self.queue.put(object_id, block, timeout) - except queue.Full: - self.client.delete([object_id]) - raise queue.Full - - def get(self, block=True, timeout=None): - if self.client is None: - self.client = plasma.connect(self.socket_name) - object_id = self.queue.get(block, timeout) - if not self.client.contains(object_id): - raise RuntimeError( - "ObjectID: {} not found in plasma store".format(object_id) - ) - data = self.client.get(object_id) - self.client.delete([object_id]) - return data - - def qsize(self): - return self.queue.qsize() - - def empty(self): - return self.queue.empty() - - def join(self): - self.queue.join() - - def disconnect_client(self): - if self.client is not None: - self.client.disconnect() - - def close(self): - self.queue.close() - self.disconnect_client() - global MGE_PLASMA_STORE_MANAGER - MGE_PLASMA_STORE_MANAGER.refcount -= 1 - _clear_plasma_store() - - def cancel_join_thread(self): - self.queue.cancel_join_thread() diff --git a/python_module/megengine/data/collator.py b/python_module/megengine/data/collator.py deleted file mode 100644 index 952fc398..00000000 --- a/python_module/megengine/data/collator.py +++ /dev/null @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2016- Facebook, Inc (Adam Paszke) -# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) -# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) -# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) -# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) -# Copyright (c) 2011-2013 NYU (Clement Farabet) -# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) -# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) -# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) -# --------------------------------------------------------------------- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# -# This file has been modified by Megvii ("Megvii Modifications"). -# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. -# ---------------------------------------------------------------------- -import collections.abc -import re - -import numpy as np - -np_str_obj_array_pattern = re.compile(r"[aO]") -default_collate_err_msg_format = ( - "default_collator: inputs must contain numpy arrays, numbers, " - "Unicode strings, bytes, dicts or lists; found {}" -) - -
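Before the class itself, a worked illustration of what the default collation below produces for a batch of toy ``(image, label)`` samples; the sample values are made up for the example:

import numpy as np

samples = [
    (np.zeros((3, 4, 4), dtype=np.float32), 1),  # (CHW image, label)
    (np.ones((3, 4, 4), dtype=np.float32), 2),
]
# Collator.apply recurses over the tuple fields and returns one batched
# value per field: the arrays are stacked, the int labels become an array.
images = np.ascontiguousarray(np.stack([s[0] for s in samples]))  # shape (2, 3, 4, 4)
labels = np.array([s[1] for s in samples])                        # shape (2,)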
-class Collator: - r""" - Used to merge a list of samples to form a mini-batch of Tensor(s), when using batched loading from a dataset. - modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py - """ - - def apply(self, inputs): - """ - input : sequence_N(tuple(CHW, C, CK)) - output : tuple(NCHW, NC, NCK) - """ - elem = inputs[0] - elem_type = type(elem) - if ( - elem_type.__module__ == "numpy" - and elem_type.__name__ != "str_" - and elem_type.__name__ != "string_" - ): - elem = inputs[0] - if elem_type.__name__ == "ndarray": - # array of string classes and object - if np_str_obj_array_pattern.search(elem.dtype.str) is not None: - raise TypeError(default_collate_err_msg_format.format(elem.dtype)) - - return np.ascontiguousarray(np.stack(inputs)) - elif elem.shape == (): # scalars - return np.array(inputs) - elif isinstance(elem, float): - return np.array(inputs, dtype=np.float64) - elif isinstance(elem, int): - return np.array(inputs) - elif isinstance(elem, (str, bytes)): - return inputs - elif isinstance(elem, collections.abc.Mapping): - return {key: self.apply([d[key] for d in inputs]) for key in elem} - elif isinstance(elem, tuple) and hasattr(elem, "_fields"): # namedtuple - return elem_type(*(self.apply(samples) for samples in zip(*inputs))) - elif isinstance(elem, collections.abc.Sequence): - transposed = zip(*inputs) - return [self.apply(samples) for samples in transposed] - - raise TypeError(default_collate_err_msg_format.format(elem_type)) diff --git a/python_module/megengine/data/dataloader.py b/python_module/megengine/data/dataloader.py deleted file mode 100644 index 1fd3482d..00000000 --- a/python_module/megengine/data/dataloader.py +++ /dev/null @@ -1,500 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections -import math -import multiprocessing -import queue -import random -import time - -import numpy as np - -from ..logger import get_logger -from ..random.rng import _random_seed_generator -from .collator import Collator -from .dataset import Dataset -from .sampler import Sampler, SequentialSampler -from .transform import PseudoTransform, Transform - -logger = get_logger(__name__) - - -MP_QUEUE_GET_TIMEOUT = 5 - - -class DataLoader: - __initialized = False - - def __init__( - self, - dataset: Dataset, - sampler: Sampler = None, - transform: Transform = None, - collator: Collator = None, - num_workers: int = 0, - timeout: int = 0, - divide: bool = False, - ): - r"""Provides a convenient way to iterate on a given dataset. - - `DataLoader` combines a dataset with a sampler, a transform and a collator, - making it flexible to get minibatches continually from a dataset. - - :type dataset: Dataset - :param dataset: dataset from which to load the minibatch. - :type sampler: Sampler - :param sampler: defines the strategy to sample data from the dataset. - If specified, :attr:`shuffle` must be ``False``. - :type transform: Transform - :param transform: defines the transforming strategy for a sampled batch. - (default: ``None``) - :type collator: Collator - :param collator: defines the merging strategy for a transformed batch. - (default: ``None``) - :type num_workers: int - :param num_workers: the number of sub-processes to load, transform and collate - the batch. ``0`` means using a single process. (default: ``0``) - :type timeout: int - :param timeout: if positive, the timeout value (in seconds) for collecting a - batch from workers. (default: 0) - :type divide: bool - :param divide: defines the parallelization strategy in multi-processing mode. - ``True`` means one batch is divided into :attr:`num_workers` pieces, and - the workers process these pieces in parallel. ``False`` means - different sub-processes process different batches. (default: ``False``) - - """ - - if num_workers < 0: - raise ValueError("num_workers should not be negative") - - if timeout < 0: - raise ValueError("timeout should not be negative") - - if divide and num_workers <= 1: - raise ValueError("divide should not be set to True when num_workers <= 1") - - self.dataset = dataset - self.num_workers = num_workers - self.timeout = timeout - - self.divide = divide - - if sampler is None: - self.sampler = SequentialSampler(dataset, batch_size=1, drop_last=False) - else: - self.sampler = sampler - - if divide: - if self.sampler.batch_size <= self.num_workers: - raise ValueError( - "batch size must not be smaller than num_workers in divide mode." - ) - elif self.sampler.batch_size % self.num_workers: - logger.warning( - "batch size is not divisible by num_workers, which may degrade performance in divide mode." - ) - - if transform is None: - self.transform = PseudoTransform() - else: - self.transform = transform - - if collator is None: - self.collator = Collator() - else: - self.collator = collator - - self.__initialized = True - - def __iter__(self): - if self.num_workers == 0: - return _SerialDataLoaderIter(self) - else: - return _ParallelDataLoaderIter(self) - - def __len__(self): - return len(self.sampler) - - -class _BaseDataLoaderIter: - def __init__(self, loader): - self.dataset = loader.dataset - self.sampler = loader.sampler - self.seed = _random_seed_generator().__next__() - self.transform = loader.transform - self.collator = loader.collator - self.num_workers = loader.num_workers - self.timeout = loader.timeout - self.divide = loader.divide - self.num_processed = 0 - - def _get_next_batch(self): - raise NotImplementedError - - def __len__(self): - return len(self.sampler) - - def __iter__(self): - return self - - def __next__(self): - if self.num_processed >= len(self): - raise StopIteration - minibatch = self._get_next_batch() - self.num_processed += 1 - return minibatch - - -class _SerialDataLoaderIter(_BaseDataLoaderIter): - def __init__(self, loader): - super(_SerialDataLoaderIter, self).__init__(loader) - self.indices_iter = iter(self.sampler) - - def _get_next_batch(self): - indices = next(self.indices_iter) - items = [self.dataset[idx] for idx in indices] - trans_items = self.transform.apply_batch(items) - return self.collator.apply(trans_items) - - -class _ParallelDataLoaderIter(_BaseDataLoaderIter): - __initialized = False - - def __init__(self, loader): - super(_ParallelDataLoaderIter, self).__init__(loader) - - self.task_queues = [ - multiprocessing.Queue(maxsize=2) for _ in range(self.num_workers) - ] - - self.feed_batch_idx = multiprocessing.Value("i", 0) - self.target_batch_idx = multiprocessing.Value("i", 0) - self.shutdown_flag = multiprocessing.Value("i", 0) - - self.trans_data_queues = [ - multiprocessing.Queue(maxsize=1) for _ in range(self.num_workers) - ] -
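As context for the process wiring that follows: in divide mode each sampled batch is cut into `num_workers` contiguous pieces inside `_task_feeding_loop` (further down in this file). A standalone sketch of that splitting arithmetic, with a made-up batch:

import math

def split_batch(indices, num_workers):
    # mirrors the divide-mode arithmetic in _task_feeding_loop
    sub_num = math.ceil(len(indices) / num_workers)
    return [indices[i * sub_num : (i + 1) * sub_num] for i in range(num_workers)]

# a batch of 10 indices split across 4 workers -> piece sizes [3, 3, 3, 1]
print(split_batch(list(range(10)), 4))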
- # use shared-memory queue implemented by pyarrow plasma store. - from ._queue import PlasmaShmQueue - - self.batch_queue = PlasmaShmQueue(maxsize=2) - - self.task_feeding_worker = multiprocessing.Process( - target=_task_feeding_loop, - args=( - iter(self.sampler), - self.task_queues, - self.num_workers, - self.divide, - self.shutdown_flag, - self.feed_batch_idx, - ), - daemon=True, - ) - self.task_feeding_worker.start() - - self.workers = [] - for worker_id in range(self.num_workers): - worker = multiprocessing.Process( - target=_worker_loop, - args=( - self.dataset, - self.task_queues[worker_id], - self.trans_data_queues[worker_id], - self.transform, - self.seed + worker_id + 1, - self.shutdown_flag, - ), - daemon=True, - ) - worker.start() - self.workers.append(worker) - - if self.divide: - self.data_collecting_worker = multiprocessing.Process( - target=_data_gathering_loop, - args=( - self.trans_data_queues, - self.batch_queue, - self.collator, - len(self), - self.num_workers, - self.shutdown_flag, - self.target_batch_idx, - ), - daemon=True, - ) - else: - self.data_collecting_worker = multiprocessing.Process( - target=_data_selecting_loop, - args=( - self.trans_data_queues, - self.batch_queue, - self.collator, - len(self), - self.num_workers, - self.shutdown_flag, - self.target_batch_idx, - ), - daemon=True, - ) - self.data_collecting_worker.start() - - self.__initialized = True - - def _check_workers(self): - # Check the status of each worker. - if not self.data_collecting_worker.is_alive(): - exitcode = self.data_collecting_worker.exitcode - if exitcode != 0: - raise RuntimeError("data collecting worker died. {}".format(exitcode)) - - if not self.task_feeding_worker.is_alive(): - exitcode = self.task_feeding_worker.exitcode - if exitcode != 0: - raise RuntimeError("task feeding worker died. {}".format(exitcode)) - - for worker_id, worker in enumerate(self.workers): - if not worker.is_alive(): - exitcode = worker.exitcode - if exitcode != 0: - raise RuntimeError("worker:{} died.
{}".format(worker_id, exitcode)) - - logger.debug("all workers are alive.") - - def _try_get_next_batch(self): - start_time = time.time() - while True: - self._check_workers() - try: - return self.batch_queue.get(timeout=1) - except queue.Empty: - logger.debug("batch queue empty!") - waited_time = time.time() - start_time - if self.timeout > 0: - if waited_time > self.timeout: - raise RuntimeError("get_next_batch timeout!") - - def _get_next_batch(self): - batch_data = self._try_get_next_batch() - return batch_data - - def _shutdown(self): - with self.shutdown_flag.get_lock(): - self.shutdown_flag.value = 1 - - if self.task_feeding_worker.is_alive(): - self.task_feeding_worker.terminate() - self.task_feeding_worker.join() - - if self.data_collecting_worker.is_alive(): - self.data_collecting_worker.terminate() - self.data_collecting_worker.join() - - for worker in self.workers: - if worker.is_alive(): - worker.terminate() - worker.join() - - for q in self.trans_data_queues: - q.cancel_join_thread() - q.close() - - for q in self.task_queues: - q.cancel_join_thread() - q.close() - - self.batch_queue.cancel_join_thread() - self.batch_queue.close() - - def __del__(self): - if self.__initialized: - self._shutdown() - - -def _task_feeding_loop( - indices_iter, task_queues, num_workers, divide, shutdown_flag, feed_batch_idx -): - # Feed the indices into the task queues - while True: - if shutdown_flag.value == 1: - break - batch_idx = feed_batch_idx.value - try: - indices = next(indices_iter) - except StopIteration: - break - if divide: - # make sure all task_queues is ready for put - while any([q.full() for q in task_queues]): - if shutdown_flag.value == 1: - return - # divide into small pieces, feed to different workers. - sub_num = math.ceil(len(indices) / num_workers) - for worker_id in range(num_workers): - sub_indices = indices[worker_id * sub_num : (worker_id + 1) * sub_num] - task_queues[worker_id].put((batch_idx, sub_indices)) - else: - # distribute tasks to different workers uniformly. 
- target_id = batch_idx % num_workers - while task_queues[target_id].full(): - if shutdown_flag.value == 1: - return - task_queues[target_id].put((batch_idx, indices)) - with feed_batch_idx.get_lock(): - feed_batch_idx.value += 1 - - -def _worker_loop(dataset, task_queue, trans_data_queue, transform, seed, shutdown_flag): - # Get dataset items and do the transform - random.seed(seed) - np.random.seed(seed) - while True: - if shutdown_flag.value == 1: - break - try: - batch_idx, indices = task_queue.get(timeout=MP_QUEUE_GET_TIMEOUT) - except queue.Empty: - continue - if len(indices) > 0: - items = [dataset[idx] for idx in indices] - trans_items = transform.apply_batch(items) - else: - # in case of incomplete last batch - trans_items = () - while True: - try: - trans_data_queue.put((batch_idx, trans_items), timeout=1) - break - except queue.Full: - if shutdown_flag.value == 1: - break - logger.debug("batch part queue is full!") - - -def _data_gathering_loop( - trans_data_queues, - batch_queue, - collator, - length, - num_workers, - shutdown_flag, - target_idx, -): - # Gather the small pieces of batch data into a full batch - while True: - if shutdown_flag.value == 1: - break - - target_batch_idx = target_idx.value - - if target_batch_idx >= length: - break - - full_trans_items = [] - for worker_id in range(num_workers): - while True: - try: - batch_idx, trans_items = trans_data_queues[worker_id].get( - timeout=MP_QUEUE_GET_TIMEOUT - ) - break - except queue.Empty: - if shutdown_flag.value == 1: - break - logger.debug( - "worker:{} data queue get timeout! target batch idx:{}".format( - worker_id, target_batch_idx - ) - ) - if batch_idx != target_batch_idx: - raise RuntimeError( - "Unexpected batch_idx in data gathering loop. worker_id:{}.".format( - worker_id - ) - ) - else: - full_trans_items.extend(trans_items) - - # Merge different parts into a batch. - full_batch = collator.apply(full_trans_items) - - while True: - try: - batch_queue.put(full_batch, timeout=1) - break - except queue.Full: - if shutdown_flag.value == 1: - break - logger.debug("batch queue is full!") - - with target_idx.get_lock(): - target_idx.value += 1 - - batch_queue.disconnect_client() - - -def _data_selecting_loop( - trans_data_queues, - batch_queue, - collator, - length, - num_workers, - shutdown_flag, - target_idx, -): - # Make sure that batches are generated in exactly the same order as the generated indices - while True: - if shutdown_flag.value == 1: - break - - target_batch_idx = target_idx.value - - if target_batch_idx >= length: - break - - target_worker_id = target_batch_idx % num_workers - while True: - try: - batch_idx, trans_items = trans_data_queues[target_worker_id].get( - timeout=MP_QUEUE_GET_TIMEOUT - ) - batch_data = collator.apply(trans_items) - break - except queue.Empty: - if shutdown_flag.value == 1: - break - logger.debug( - "worker:{} data queue get timeout!
target batch idx:{}".format( - target_worker_id, target_batch_idx - ) - ) - - if batch_idx != target_batch_idx: - raise RuntimeError( - "batch_idx {} mismatch the target_batch_idx {}".format( - batch_idx, target_batch_idx - ) - ) - - while True: - try: - batch_queue.put(batch_data, timeout=1) - break - except queue.Full: - if shutdown_flag.value == 1: - break - logger.debug("batch queue is full!") - - with target_idx.get_lock(): - target_idx.value += 1 - - batch_queue.disconnect_client() diff --git a/python_module/megengine/data/dataset/__init__.py b/python_module/megengine/data/dataset/__init__.py deleted file mode 100644 index 8b70d221..00000000 --- a/python_module/megengine/data/dataset/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .meta_dataset import ArrayDataset, Dataset, MapDataset, StreamDataset -from .vision import * diff --git a/python_module/megengine/data/dataset/meta_dataset.py b/python_module/megengine/data/dataset/meta_dataset.py deleted file mode 100644 index 4415a427..00000000 --- a/python_module/megengine/data/dataset/meta_dataset.py +++ /dev/null @@ -1,73 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from abc import ABC, abstractmethod -from typing import Tuple - - -class Dataset(ABC): - r""" - An abstract class for all Datasets - """ - - @abstractmethod - def __init__(self): - pass - - -class MapDataset(Dataset): - r""" - An abstract class for map data - __getitem__ and __len__ method are aditionally needed - """ - - @abstractmethod - def __init__(self): - pass - - @abstractmethod - def __getitem__(self, index): - pass - - @abstractmethod - def __len__(self): - pass - - -class StreamDataset(Dataset): - r""" - An abstract class for stream data - __iter__ method is aditionally needed - """ - - @abstractmethod - def __init__(self): - pass - - @abstractmethod - def __iter__(self): - pass - - -class ArrayDataset(MapDataset): - def __init__(self, *arrays): - r""" - ArrayDataset is a dataset for numpy array data, one or more numpy arrays - are needed to initiate the dataset. And the dimensions represented sample number - are expected to be the same. 
- """ - super().__init__() - if not all(len(arrays[0]) == len(array) for array in arrays): - raise ValueError("lengths of input arrays are inconsistent") - self.arrays = arrays - - def __getitem__(self, index: int) -> Tuple: - return tuple(array[index] for array in self.arrays) - - def __len__(self) -> int: - return len(self.arrays[0]) diff --git a/python_module/megengine/data/dataset/vision/__init__.py b/python_module/megengine/data/dataset/vision/__init__.py deleted file mode 100644 index dd2b0fc3..00000000 --- a/python_module/megengine/data/dataset/vision/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .cifar import CIFAR10, CIFAR100 -from .cityscapes import Cityscapes -from .coco import COCO -from .folder import ImageFolder -from .imagenet import ImageNet -from .meta_vision import VisionDataset -from .mnist import MNIST -from .objects365 import Objects365 -from .voc import PascalVOC diff --git a/python_module/megengine/data/dataset/vision/cifar.py b/python_module/megengine/data/dataset/vision/cifar.py deleted file mode 100644 index 9ce73688..00000000 --- a/python_module/megengine/data/dataset/vision/cifar.py +++ /dev/null @@ -1,171 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import os -import pickle -import tarfile -from typing import Tuple - -import numpy as np - -from ....logger import get_logger -from .meta_vision import VisionDataset -from .utils import _default_dataset_root, load_raw_data_from_url - -logger = get_logger(__name__) - - -class CIFAR10(VisionDataset): - r""" ``Dataset`` for CIFAR10 meta data - """ - - url_path = "http://www.cs.utoronto.ca/~kriz/" - raw_file_name = "cifar-10-python.tar.gz" - raw_file_md5 = "c58f30108f718f92721af3b95e74349a" - raw_file_dir = "cifar-10-batches-py" - train_batch = [ - "data_batch_1", - "data_batch_2", - "data_batch_3", - "data_batch_4", - "data_batch_5", - ] - test_batch = ["test_batch"] - meta_info = {"name": "batches.meta"} - - def __init__( - self, - root: str = None, - train: bool = True, - download: bool = True, - timeout: int = 500, - ): - super().__init__(root, order=("image", "image_category")) - - self.timeout = timeout - - # process the root path - if root is None: - self.root = self._default_root - if not os.path.exists(self.root): - os.makedirs(self.root) - else: - self.root = root - if not os.path.exists(self.root): - if download: - logger.debug( - "dir %s does not exist, will be automatically created", - self.root, - ) - os.makedirs(self.root) - else: - raise ValueError("dir %s does not exist" % self.root) - - self.target_file = os.path.join(self.root, self.raw_file_dir) - - # check existence of target pickle dir, if exists load the - # pickle file no matter what download is set - if os.path.exists(self.target_file): - if train: - self.arrays = self.bytes2array(self.train_batch) - else: - self.arrays = self.bytes2array(self.test_batch) - else: - if download: - self.download() - if train: - self.arrays = self.bytes2array(self.train_batch) - else: - self.arrays = self.bytes2array(self.test_batch) - else: - raise ValueError( - "dir does not contain target file %s, please set download=True" - % (self.target_file) - ) - - def __getitem__(self, index: int) -> Tuple: - return tuple(array[index] for array in self.arrays) - - def __len__(self) -> int: - return len(self.arrays[0]) - - @property - def _default_root(self): - return os.path.join(_default_dataset_root(), self.__class__.__name__) - - @property - def meta(self): - meta_path = os.path.join(self.root, self.raw_file_dir, self.meta_info["name"]) - with open(meta_path, "rb") as f: - meta = pickle.load(f, encoding="bytes") - return meta - - def download(self): - url = self.url_path + self.raw_file_name - load_raw_data_from_url( - url, self.raw_file_name, self.raw_file_md5, self.root, self.timeout - ) - self.process() - - def untar(self, file_path, dirs): - assert file_path.endswith(".tar.gz") - logger.debug("untar file %s to %s", file_path, dirs) - t = tarfile.open(file_path) - t.extractall(path=dirs) - - def bytes2array(self, filenames): - data = [] - label = [] - for filename in filenames: - path = os.path.join(self.root, self.raw_file_dir, filename) - logger.debug("unpickle file %s", path) - with open(path, "rb") as fo: - dic = pickle.load(fo, encoding="bytes") - batch_data = dic[b"data"].reshape(-1, 3, 32, 32).transpose((0, 2, 3, 1)) - data.extend(list(batch_data[..., [2, 1, 0]])) - label.extend(dic[b"labels"]) - label = np.array(label, dtype=np.int32) - return (data, label) - - def process(self): - logger.info("process raw data ...") - self.untar(os.path.join(self.root, self.raw_file_name), self.root) - - -class CIFAR100(CIFAR10): - url_path = "http://www.cs.utoronto.ca/~kriz/" - raw_file_name = "cifar-100-python.tar.gz" - raw_file_md5 = 
"eb9058c3a382ffc7106e4002c42a8d85" - raw_file_dir = "cifar-100-python" - train_batch = ["train"] - test_batch = ["test"] - meta_info = {"name": "meta"} - - @property - def meta(self): - meta_path = os.path.join(self.root, self.raw_file_dir, self.meta_info["name"]) - with open(meta_path, "rb") as f: - meta = pickle.load(f, encoding="bytes") - return meta - - def bytes2array(self, filenames): - data = [] - fine_label = [] - coarse_label = [] - for filename in filenames: - path = os.path.join(self.root, self.raw_file_dir, filename) - logger.debug("unpickle file %s", path) - with open(path, "rb") as fo: - dic = pickle.load(fo, encoding="bytes") - batch_data = dic[b"data"].reshape(-1, 3, 32, 32).transpose((0, 2, 3, 1)) - data.extend(list(batch_data[..., [2, 1, 0]])) - fine_label.extend(dic[b"fine_labels"]) - coarse_label.extend(dic[b"coarse_labels"]) - fine_label = np.array(fine_label, dtype=np.int32) - coarse_label = np.array(coarse_label, dtype=np.int32) - return data, fine_label, coarse_label diff --git a/python_module/megengine/data/dataset/vision/cityscapes.py b/python_module/megengine/data/dataset/vision/cityscapes.py deleted file mode 100644 index aa05ac92..00000000 --- a/python_module/megengine/data/dataset/vision/cityscapes.py +++ /dev/null @@ -1,151 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# --------------------------------------------------------------------- -# Part of the following code in this file refs to torchvision -# BSD 3-Clause License -# -# Copyright (c) Soumith Chintala 2016, -# All rights reserved. -# --------------------------------------------------------------------- -import json -import os - -import cv2 -import numpy as np - -from .meta_vision import VisionDataset - - -class Cityscapes(VisionDataset): - r"""`Cityscapes `_ Dataset. 
- """ - - supported_order = ( - "image", - "mask", - "info", - ) - - def __init__(self, root, image_set, mode, *, order=None): - super().__init__(root, order=order, supported_order=self.supported_order) - - city_root = self.root - if not os.path.isdir(city_root): - raise RuntimeError("Dataset not found or corrupted.") - - self.mode = mode - self.images_dir = os.path.join(city_root, "leftImg8bit", image_set) - self.masks_dir = os.path.join(city_root, self.mode, image_set) - self.images, self.masks = [], [] - # self.target_type = ["instance", "semantic", "polygon", "color"] - - # for semantic segmentation - if mode == "gtFine": - valid_modes = ("train", "test", "val") - else: - valid_modes = ("train", "train_extra", "val") - - for city in os.listdir(self.images_dir): - img_dir = os.path.join(self.images_dir, city) - mask_dir = os.path.join(self.masks_dir, city) - for file_name in os.listdir(img_dir): - mask_name = "{}_{}".format( - file_name.split("_leftImg8bit")[0], - self._get_target_suffix(self.mode, "semantic"), - ) - self.images.append(os.path.join(img_dir, file_name)) - self.masks.append(os.path.join(mask_dir, mask_name)) - - def __getitem__(self, index): - target = [] - for k in self.order: - if k == "image": - image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) - target.append(image) - elif k == "mask": - mask = cv2.imread(self.masks[index], cv2.IMREAD_GRAYSCALE) - mask = self._trans_mask(mask) - mask = mask[:, :, np.newaxis] - target.append(mask) - elif k == "info": - if image is None: - image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) - info = [image.shape[0], image.shape[1], self.images[index]] - target.append(info) - else: - raise NotImplementedError - - return tuple(target) - - def __len__(self): - return len(self.images) - - def _trans_mask(self, mask): - trans_labels = [ - 7, - 8, - 11, - 12, - 13, - 17, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 31, - 32, - 33, - ] - label = np.ones(mask.shape) * 255 - for i, tl in enumerate(trans_labels): - label[mask == tl] = i - return label.astype(np.uint8) - - def _get_target_suffix(self, mode, target_type): - if target_type == "instance": - return "{}_instanceIds.png".format(mode) - elif target_type == "semantic": - return "{}_labelIds.png".format(mode) - elif target_type == "color": - return "{}_color.png".format(mode) - else: - return "{}_polygons.json".format(mode) - - def _load_json(self, path): - with open(path, "r") as file: - data = json.load(file) - return data - - class_names = ( - "road", - "sidewalk", - "building", - "wall", - "fence", - "pole", - "traffic light", - "traffic sign", - "vegetation", - "terrain", - "sky", - "person", - "rider", - "car", - "truck", - "bus", - "train", - "motorcycle", - "bicycle", - ) diff --git a/python_module/megengine/data/dataset/vision/coco.py b/python_module/megengine/data/dataset/vision/coco.py deleted file mode 100644 index 11366de0..00000000 --- a/python_module/megengine/data/dataset/vision/coco.py +++ /dev/null @@ -1,366 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# --------------------------------------------------------------------- -# Part of the following code in this file refs to maskrcnn-benchmark -# MIT License -# -# Copyright (c) 2018 Facebook -# --------------------------------------------------------------------- -import json -import os -from collections import defaultdict - -import cv2 -import numpy as np - -from .meta_vision import VisionDataset - -min_keypoints_per_image = 10 - - -def _count_visible_keypoints(anno): - return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) - - -def has_valid_annotation(anno, order): - # if it"s empty, there is no annotation - if len(anno) == 0: - return False - if "boxes" in order or "boxes_category" in order: - if "bbox" not in anno[0]: - return False - if "keypoints" in order: - if "keypoints" not in anno[0]: - return False - # for keypoint detection tasks, only consider valid images those - # containing at least min_keypoints_per_image - if _count_visible_keypoints(anno) < min_keypoints_per_image: - return False - return True - - -class COCO(VisionDataset): - r"""`MS COCO `_ Dataset. - """ - - supported_order = ( - "image", - "boxes", - "boxes_category", - "keypoints", - # TODO: need to check - # "polygons", - "info", - ) - - def __init__( - self, root, ann_file, remove_images_without_annotations=False, *, order=None - ): - super().__init__(root, order=order, supported_order=self.supported_order) - - with open(ann_file, "r") as f: - dataset = json.load(f) - - self.imgs = dict() - for img in dataset["images"]: - # for saving memory - if "license" in img: - del img["license"] - if "coco_url" in img: - del img["coco_url"] - if "date_captured" in img: - del img["date_captured"] - if "flickr_url" in img: - del img["flickr_url"] - self.imgs[img["id"]] = img - - self.img_to_anns = defaultdict(list) - for ann in dataset["annotations"]: - # for saving memory - if ( - "boxes" not in self.order - and "boxes_category" not in self.order - and "bbox" in ann - ): - del ann["bbox"] - if "polygons" not in self.order and "segmentation" in ann: - del ann["segmentation"] - self.img_to_anns[ann["image_id"]].append(ann) - - self.cats = dict() - for cat in dataset["categories"]: - self.cats[cat["id"]] = cat - - self.ids = list(sorted(self.imgs.keys())) - - # filter images without detection annotations - if remove_images_without_annotations: - ids = [] - for img_id in self.ids: - anno = self.img_to_anns[img_id] - # filter crowd annotations - anno = [obj for obj in anno if obj["iscrowd"] == 0] - anno = [ - obj for obj in anno if obj["bbox"][2] > 0 and obj["bbox"][3] > 0 - ] - if has_valid_annotation(anno, order): - ids.append(img_id) - self.img_to_anns[img_id] = anno - else: - del self.imgs[img_id] - del self.img_to_anns[img_id] - self.ids = ids - - self.json_category_id_to_contiguous_id = { - v: i + 1 for i, v in enumerate(sorted(self.cats.keys())) - } - - self.contiguous_category_id_to_json_id = { - v: k for k, v in self.json_category_id_to_contiguous_id.items() - } - - def __getitem__(self, index): - img_id = self.ids[index] - anno = self.img_to_anns[img_id] - - target = [] - for k in self.order: - if k == "image": - file_name = self.imgs[img_id]["file_name"] - path = os.path.join(self.root, file_name) - image = cv2.imread(path, cv2.IMREAD_COLOR) - target.append(image) - elif k == "boxes": - boxes = [obj["bbox"] for obj in anno] - boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4) - # transfer boxes from xywh to xyxy - boxes[:, 2:] += boxes[:, :2] - target.append(boxes) - elif k == 
"boxes_category": - boxes_category = [obj["category_id"] for obj in anno] - boxes_category = [ - self.json_category_id_to_contiguous_id[c] for c in boxes_category - ] - boxes_category = np.array(boxes_category, dtype=np.int32) - target.append(boxes_category) - elif k == "keypoints": - keypoints = [obj["keypoints"] for obj in anno] - keypoints = np.array(keypoints, dtype=np.float32).reshape( - -1, len(self.keypoint_names), 3 - ) - target.append(keypoints) - elif k == "polygons": - polygons = [obj["segmentation"] for obj in anno] - polygons = [ - [np.array(p, dtype=np.float32).reshape(-1, 2) for p in ps] - for ps in polygons - ] - target.append(polygons) - elif k == "info": - info = self.imgs[img_id] - info = [info["height"], info["width"], info["file_name"]] - target.append(info) - else: - raise NotImplementedError - - return tuple(target) - - def __len__(self): - return len(self.ids) - - def get_img_info(self, index): - img_id = self.ids[index] - img_info = self.imgs[img_id] - return img_info - - class_names = ( - "person", - "bicycle", - "car", - "motorcycle", - "airplane", - "bus", - "train", - "truck", - "boat", - "traffic light", - "fire hydrant", - "stop sign", - "parking meter", - "bench", - "bird", - "cat", - "dog", - "horse", - "sheep", - "cow", - "elephant", - "bear", - "zebra", - "giraffe", - "backpack", - "umbrella", - "handbag", - "tie", - "suitcase", - "frisbee", - "skis", - "snowboard", - "sports ball", - "kite", - "baseball bat", - "baseball glove", - "skateboard", - "surfboard", - "tennis racket", - "bottle", - "wine glass", - "cup", - "fork", - "knife", - "spoon", - "bowl", - "banana", - "apple", - "sandwich", - "orange", - "broccoli", - "carrot", - "hot dog", - "pizza", - "donut", - "cake", - "chair", - "couch", - "potted plant", - "bed", - "dining table", - "toilet", - "tv", - "laptop", - "mouse", - "remote", - "keyboard", - "cell phone", - "microwave", - "oven", - "toaster", - "sink", - "refrigerator", - "book", - "clock", - "vase", - "scissors", - "teddy bear", - "hair drier", - "toothbrush", - ) - - classes_originID = { - "person": 1, - "bicycle": 2, - "car": 3, - "motorcycle": 4, - "airplane": 5, - "bus": 6, - "train": 7, - "truck": 8, - "boat": 9, - "traffic light": 10, - "fire hydrant": 11, - "stop sign": 13, - "parking meter": 14, - "bench": 15, - "bird": 16, - "cat": 17, - "dog": 18, - "horse": 19, - "sheep": 20, - "cow": 21, - "elephant": 22, - "bear": 23, - "zebra": 24, - "giraffe": 25, - "backpack": 27, - "umbrella": 28, - "handbag": 31, - "tie": 32, - "suitcase": 33, - "frisbee": 34, - "skis": 35, - "snowboard": 36, - "sports ball": 37, - "kite": 38, - "baseball bat": 39, - "baseball glove": 40, - "skateboard": 41, - "surfboard": 42, - "tennis racket": 43, - "bottle": 44, - "wine glass": 46, - "cup": 47, - "fork": 48, - "knife": 49, - "spoon": 50, - "bowl": 51, - "banana": 52, - "apple": 53, - "sandwich": 54, - "orange": 55, - "broccoli": 56, - "carrot": 57, - "hot dog": 58, - "pizza": 59, - "donut": 60, - "cake": 61, - "chair": 62, - "couch": 63, - "potted plant": 64, - "bed": 65, - "dining table": 67, - "toilet": 70, - "tv": 72, - "laptop": 73, - "mouse": 74, - "remote": 75, - "keyboard": 76, - "cell phone": 77, - "microwave": 78, - "oven": 79, - "toaster": 80, - "sink": 81, - "refrigerator": 82, - "book": 84, - "clock": 85, - "vase": 86, - "scissors": 87, - "teddy bear": 88, - "hair drier": 89, - "toothbrush": 90, - } - - keypoint_names = ( - "nose", - "left_eye", - "right_eye", - "left_ear", - "right_ear", - "left_shoulder", - "right_shoulder", - 
"left_elbow", - "right_elbow", - "left_wrist", - "right_wrist", - "left_hip", - "right_hip", - "left_knee", - "right_knee", - "left_ankle", - "right_ankle", - ) diff --git a/python_module/megengine/data/dataset/vision/folder.py b/python_module/megengine/data/dataset/vision/folder.py deleted file mode 100644 index 7124ef56..00000000 --- a/python_module/megengine/data/dataset/vision/folder.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- -# BSD 3-Clause License - -# Copyright (c) Soumith Chintala 2016, -# All rights reserved. -# --------------------------------------------------------------------- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# -# This file has been modified by Megvii ("Megvii Modifications"). -# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. -# --------------------------------------------------------------------- -import os -from typing import Dict, List, Tuple - -import cv2 -import numpy as np - -from .meta_vision import VisionDataset -from .utils import is_img - - -class ImageFolder(VisionDataset): - def __init__(self, root: str, check_valid_func=None, class_name: bool = False): - r""" - ImageFolder is a class for loading image data and labels from a organized folder. - - the folder is expected to be organized as followed - root/cls/xxx.img_ext - - labels are indices of sorted classes in the root directory - - :param root: root directory of an image folder - :param loader: a function used to load image from path, - if ``None``, default function that loads - images with PILwill be called - :param check_valid_func: a function used to check if files in folder are - expected image files, if ``None``, default function - that checks file extensions will be called - :param class_name: if ``True``, return class name instead of class index - - """ - super().__init__(root, order=("image", "image_category")) - - self.root = root - - if check_valid_func is not None: - self.check_valid = check_valid_func - else: - self.check_valid = is_img - - self.class_name = class_name - - self.class_dict = self.collect_class() - self.samples = self.collect_samples() - - def collect_samples(self) -> List: - samples = [] - directory = os.path.expanduser(self.root) - for key in sorted(self.class_dict.keys()): - d = os.path.join(directory, key) - if not os.path.isdir(d): - continue - for r, _, filename in sorted(os.walk(d, followlinks=True)): - for name in sorted(filename): - path = os.path.join(r, name) - if self.check_valid(path): - if self.class_name: - samples.append((path, key)) - else: - samples.append((path, self.class_dict[key])) - return samples - - def collect_class(self) -> Dict: - classes = [d.name for d in os.scandir(self.root) if d.is_dir()] - classes.sort() - return {classes[i]: np.int32(i) for i in range(len(classes))} - - def __getitem__(self, index: int) -> Tuple: - path, label = self.samples[index] - img = cv2.imread(path, cv2.IMREAD_COLOR) - return img, label - - def __len__(self): - return len(self.samples) diff --git a/python_module/megengine/data/dataset/vision/imagenet.py b/python_module/megengine/data/dataset/vision/imagenet.py deleted file mode 100644 index 449401bf..00000000 --- 
diff --git a/python_module/megengine/data/dataset/vision/imagenet.py b/python_module/megengine/data/dataset/vision/imagenet.py deleted file mode 100644 index 449401bf..00000000 --- a/python_module/megengine/data/dataset/vision/imagenet.py +++ /dev/null @@ -1,248 +0,0 @@ -# -*- coding: utf-8 -*- -# BSD 3-Clause License -# -# Copyright (c) Soumith Chintala 2016, -# All rights reserved. -# --------------------------------------------------------------------- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# -# This file has been modified by Megvii ("Megvii Modifications"). -# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. -# --------------------------------------------------------------------- -import os -import shutil -from tqdm import tqdm -from ....core.serialization import load, save -from ....distributed.util import is_distributed -from ....logger import get_logger -from .folder import ImageFolder -from .utils import _default_dataset_root, calculate_md5, untar, untargz -logger = get_logger(__name__) -class ImageNet(ImageFolder): - r""" - Load ImageNet from raw files or folder, expected folder looks like - - .. code-block:: bash - - ${root}/ - | [REQUIRED TAR FILES] - |- ILSVRC2012_img_train.tar - |- ILSVRC2012_img_val.tar - |- ILSVRC2012_devkit_t12.tar.gz - | [OPTIONAL IMAGE FOLDERS] - |- train/cls/xxx.${img_ext} - |- val/cls/xxx.${img_ext} - |- ILSVRC2012_devkit_t12/data/meta.mat - |- ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt - - If the image folders don't exist, the raw tar files are required and will be extracted and processed. - """ - raw_file_meta = { - "train": ("ILSVRC2012_img_train.tar", "1d675b47d978889d74fa0da5fadfb00e"), - "val": ("ILSVRC2012_img_val.tar", "29b22e2961454d5413ddabcf34fc5622"), - "devkit": ("ILSVRC2012_devkit_t12.tar.gz", "fa75699e90414af021442c21a62c3abf"), - } # ImageNet raw files - default_train_dir = "train" - default_val_dir = "val" - default_devkit_dir = "ILSVRC2012_devkit_t12" - def __init__(self, root: str = None, train: bool = True, **kwargs): - r""" - initialization: - - * if ``root`` contains ``self.target_folder`` depending on ``train``: - - * initialize ImageFolder with target_folder - - * else: - - * if all raw files are in ``root``: - - * parse ``self.target_folder`` from raw files - * initialize ImageFolder with ``self.target_folder`` - - * else: - - * raise error - - :param root: root directory of imagenet data, if root is ``None``, use the default dataset root - :param train: if ``True``, load the train split, otherwise load the validation split - """ - # process the root path - if root is None: - self.root = self._default_root - else: - self.root = root - if not os.path.exists(self.root): - raise FileNotFoundError("dir %s does not exist" % self.root) - self.devkit_dir = os.path.join(self.root, self.default_devkit_dir) - if not os.path.exists(self.devkit_dir): - logger.warning("devkit directory %s does not exist", self.devkit_dir) - self._prepare_devkit() - self.train = train - if train: - self.target_folder = os.path.join(self.root, self.default_train_dir) - else: - self.target_folder = os.path.join(self.root, self.default_val_dir) - if not os.path.exists(self.target_folder): - logger.warning( - "expected image folder %s does not exist, try to load from raw file", - self.target_folder, - ) - if not self.check_raw_file(): - raise 
FileNotFoundError( - "expected image folder %s does not exist, and raw files do not exist in %s" - % (self.target_folder, self.root) - ) - elif is_distributed(): - raise RuntimeError( - "extracting raw file shouldn't be done in distributed mode, use single process instead" - ) - elif train: - self._prepare_train() - else: - self._prepare_val() - - super().__init__(self.target_folder, **kwargs) - - @property - def _default_root(self): - return os.path.join(_default_dataset_root(), self.__class__.__name__) - - @property - def valid_ground_truth(self): - groud_truth_path = os.path.join( - self.devkit_dir, "data", "ILSVRC2012_validation_ground_truth.txt" - ) - if os.path.exists(groud_truth_path): - with open(groud_truth_path, "r") as f: - val_labels = f.readlines() - return [int(val_label) for val_label in val_labels] - else: - raise FileNotFoundError( - "valid ground truth file %s does not exist" % groud_truth_path - ) - - @property - def meta(self): - try: - return load(os.path.join(self.devkit_dir, "meta.pkl")) - except FileNotFoundError: - import scipy.io - - meta_path = os.path.join(self.devkit_dir, "data", "meta.mat") - if not os.path.exists(meta_path): - raise FileNotFoundError("meta file %s does not exist" % meta_path) - meta = scipy.io.loadmat(meta_path, squeeze_me=True)["synsets"] - nums_children = list(zip(*meta))[4] - meta = [ - meta[idx] - for idx, num_children in enumerate(nums_children) - if num_children == 0 - ] - idcs, wnids, classes = list(zip(*meta))[:3] - classes = [tuple(clss.split(", ")) for clss in classes] - idx_to_wnid = dict(zip(idcs, wnids)) - wnid_to_classes = dict(zip(wnids, classes)) - logger.info( - "saving cached meta file to %s", - os.path.join(self.devkit_dir, "meta.pkl"), - ) - save( - (idx_to_wnid, wnid_to_classes), - os.path.join(self.devkit_dir, "meta.pkl"), - ) - return idx_to_wnid, wnid_to_classes - - def check_raw_file(self) -> bool: - return all( - [ - os.path.exists(os.path.join(self.root, value[0])) - for _, value in self.raw_file_meta.items() - ] - ) - - def _organize_val_data(self): - id2wnid = self.meta[0] - val_idcs = self.valid_ground_truth - val_wnids = [id2wnid[idx] for idx in val_idcs] - - val_images = sorted( - [ - os.path.join(self.target_folder, image) - for image in os.listdir(self.target_folder) - ] - ) - - logger.debug("mkdir for val set wnids") - for wnid in set(val_wnids): - os.makedirs(os.path.join(self.root, self.default_val_dir, wnid)) - - logger.debug("mv val images into wnids dir") - for wnid, img_file in tqdm(zip(val_wnids, val_images)): - shutil.move( - img_file, - os.path.join( - self.root, self.default_val_dir, wnid, os.path.basename(img_file) - ), - ) - - def _prepare_val(self): - assert not self.train - raw_filename, checksum = self.raw_file_meta["val"] - raw_file = os.path.join(self.root, raw_filename) - logger.info("checksum valid tar file %s ...", raw_file) - assert ( - calculate_md5(raw_file) == checksum - ), "checksum mismatch, {} may be damaged".format(raw_file) - logger.info("extract valid tar file... this may take 10-20 minutes") - untar(os.path.join(self.root, raw_file), self.target_folder) - self._organize_val_data() - - def _prepare_train(self): - assert self.train - raw_filename, checksum = self.raw_file_meta["train"] - raw_file = os.path.join(self.root, raw_filename) - logger.info("checksum train tar file %s ...", raw_file) - assert ( - calculate_md5(raw_file) == checksum - ), "checksum mismatch, {} may be damaged".format(raw_file) - logger.info("extract train tar file.. 
this may take several hours") - untar( - os.path.join(self.root, raw_file), self.target_folder, - ) - paths = [ - os.path.join(self.target_folder, child_dir) - for child_dir in os.listdir(self.target_folder) - ] - for path in tqdm(paths): - untar(path, os.path.splitext(path)[0], remove=True) - - def _prepare_devkit(self): - raw_filename, checksum = self.raw_file_meta["devkit"] - raw_file = os.path.join(self.root, raw_filename) - logger.info("checksum devkit tar file %s ...", raw_file) - assert ( - calculate_md5(raw_file) == checksum - ), "checksum mismatch, {} may be damaged".format(raw_file) - logger.info("extract devkit file..") - untargz(os.path.join(self.root, self.raw_file_meta["devkit"][0])) diff --git a/python_module/megengine/data/dataset/vision/meta_vision.py b/python_module/megengine/data/dataset/vision/meta_vision.py deleted file mode 100644 index 6d03d3ed..00000000 --- a/python_module/megengine/data/dataset/vision/meta_vision.py +++ /dev/null @@ -1,41 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections.abc -import os - -from ..meta_dataset import MapDataset - - -class VisionDataset(MapDataset): - _repr_indent = 4 - - def __init__(self, root, *, order=None, supported_order=None): - if isinstance(root, (str, bytes)): - root = os.path.expanduser(root) - self.root = root - - if order is None: - order = ("image",) - if not isinstance(order, collections.abc.Sequence): - raise ValueError( - "order should be a sequence, but got order={}".format(order) - ) - - if supported_order is not None: - assert isinstance(supported_order, collections.abc.Sequence) - for k in order: - if k not in supported_order: - raise NotImplementedError("{} is unsupported data type".format(k)) - self.order = order - - def __getitem__(self, index): - raise NotImplementedError - - def __len__(self): - raise NotImplementedError diff --git a/python_module/megengine/data/dataset/vision/mnist.py b/python_module/megengine/data/dataset/vision/mnist.py deleted file mode 100644 index 5e89a314..00000000 --- a/python_module/megengine/data/dataset/vision/mnist.py +++ /dev/null @@ -1,197 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
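# Minimal sketch of the VisionDataset contract defined in meta_vision.py
# above (illustrative names; assumes VisionDataset is in scope): subclasses
# declare an order, optionally validate it against supported_order, and
# implement __getitem__ and __len__.
import numpy as np

class ConstantImages(VisionDataset):
    def __init__(self, root, n=2):
        super().__init__(root, order=("image",), supported_order=("image",))
        self.data = [np.zeros((8, 8, 3), dtype=np.uint8) for _ in range(n)]

    def __getitem__(self, index):
        return (self.data[index],)

    def __len__(self):
        return len(self.data)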
-import gzip -import os -import struct -from typing import Tuple -import numpy as np -from tqdm import tqdm -from ....logger import get_logger -from .meta_vision import VisionDataset -from .utils import _default_dataset_root, load_raw_data_from_url -logger = get_logger(__name__) -class MNIST(VisionDataset): - r""" ``Dataset`` for MNIST data - """ - url_path = "http://yann.lecun.com/exdb/mnist/" - """ - url prefix for downloading raw files - """ - raw_file_name = [ - "train-images-idx3-ubyte.gz", - "train-labels-idx1-ubyte.gz", - "t10k-images-idx3-ubyte.gz", - "t10k-labels-idx1-ubyte.gz", - ] - """ - raw file names of both training set and test set (10k) - """ - raw_file_md5 = [ - "f68b3c2dcbeaaa9fbdd348bbdeb94873", - "d53e105ee54ea40749a09fcbcd1e9432", - "9fb629c4189551a2d022fa330f9573f3", - "ec29112dd5afa0611ce80d1b7f02629c", - ] - """ - md5 for checking raw files - """ - def __init__( - self, - root: str = None, - train: bool = True, - download: bool = True, - timeout: int = 500, - ): - r""" - :param root: path for mnist dataset downloading or loading, if ``None``, - set ``root`` to the ``_default_root`` - :param train: if ``True``, load the training set, otherwise load the test set - :param download: if the raw files do not exist and ``download`` is set to ``True``, - download the raw files and process them, otherwise raise ``ValueError``; default is ``True`` - - """ - super().__init__(root, order=("image", "image_category")) - self.timeout = timeout - # process the root path - if root is None: - self.root = self._default_root - if not os.path.exists(self.root): - os.makedirs(self.root) - else: - self.root = root - if not os.path.exists(self.root): - if download: - logger.debug( - "dir %s does not exist, will be automatically created", - self.root, - ) - os.makedirs(self.root) - else: - raise ValueError("dir %s does not exist" % self.root) - if self._check_raw_files(): - self.process(train) - elif download: - self.download() - self.process(train) - else: - raise ValueError( - "root does not contain valid raw files, please set download=True" - ) - def __getitem__(self, index: int) -> Tuple: - return tuple(array[index] for array in self.arrays) - def __len__(self) -> int: - return len(self.arrays[0]) - @property - def _default_root(self): - return os.path.join(_default_dataset_root(), self.__class__.__name__) - @property - def meta(self): - return self._meta_data - def _check_raw_files(self): - return all( - [ - os.path.exists(os.path.join(self.root, path)) - for path in self.raw_file_name - ] - ) - def download(self): - for file_name, md5 in zip(self.raw_file_name, self.raw_file_md5): - url = self.url_path + file_name - load_raw_data_from_url(url, file_name, md5, self.root, self.timeout) - def process(self, train): - # load raw files and transform them into meta data and datasets Tuple(np.array) - logger.info("process the raw files of %s set...", "train" if train else "test") - if train: - meta_data_images, images = parse_idx3( - os.path.join(self.root, self.raw_file_name[0]) - ) - meta_data_labels, labels = parse_idx1( - os.path.join(self.root, self.raw_file_name[1]) - ) - else: - meta_data_images, images = parse_idx3( - os.path.join(self.root, self.raw_file_name[2]) - ) - meta_data_labels, labels = parse_idx1( - os.path.join(self.root, self.raw_file_name[3]) - ) - self._meta_data = { - "images": meta_data_images, - "labels": meta_data_labels, - } - self.arrays = (images, labels.astype(np.int32)) -def parse_idx3(idx3_file): - # parse idx3 file to meta data and data in numpy array (images)
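# Worked example of the IDX header layout parsed below: four big-endian
# int32s (magic, image count, rows, cols). The bytes here are fabricated
# purely to illustrate the struct format (MNIST image files use magic 2051).
import struct

header = struct.pack(">iiii", 2051, 60000, 28, 28)
magic, imgs, height, width = struct.unpack_from(">iiii", header, 0)
assert (magic, imgs, height, width) == (2051, 60000, 28, 28)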
- logger.debug("parse idx3 file %s ...", idx3_file) - assert idx3_file.endswith(".gz") - with gzip.open(idx3_file, "rb") as f: - bin_data = f.read() - # parse meta data - offset = 0 - fmt_header = ">iiii" - magic, imgs, height, width = struct.unpack_from(fmt_header, bin_data, offset) - meta_data = {"magic": magic, "imgs": imgs, "height": height, "width": width} - # parse images - image_size = height * width - offset += struct.calcsize(fmt_header) - fmt_image = ">" + str(image_size) + "B" - images = [] - bar = tqdm(total=meta_data["imgs"], ncols=80) - for image in struct.iter_unpack(fmt_image, bin_data[offset:]): - images.append(np.array(image, dtype=np.uint8).reshape((height, width, 1))) - bar.update() - bar.close() - return meta_data, images -def parse_idx1(idx1_file): - # parse idx1 file to meta data and data in numpy array (labels) - logger.debug("parse idx1 file %s ...", idx1_file) - assert idx1_file.endswith(".gz") - with gzip.open(idx1_file, "rb") as f: - bin_data = f.read() - # parse meta data - offset = 0 - fmt_header = ">ii" - magic, imgs = struct.unpack_from(fmt_header, bin_data, offset) - meta_data = {"magic": magic, "imgs": imgs} - # parse labels - offset += struct.calcsize(fmt_header) - fmt_image = ">B" - labels = np.empty(imgs, dtype=int) - bar = tqdm(total=meta_data["imgs"], ncols=80) - for i, label in enumerate(struct.iter_unpack(fmt_image, bin_data[offset:])): - labels[i] = label[0] - bar.update() - bar.close() - return meta_data, labels diff --git a/python_module/megengine/data/dataset/vision/objects365.py b/python_module/megengine/data/dataset/vision/objects365.py deleted file mode 100644 index e56e6462..00000000 --- a/python_module/megengine/data/dataset/vision/objects365.py +++ /dev/null @@ -1,498 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# --------------------------------------------------------------------- -# Part of the following code in this file refers to maskrcnn-benchmark -# MIT License -# -# Copyright (c) 2018 Facebook -# --------------------------------------------------------------------- -import json -import os -from collections import defaultdict -import cv2 -import numpy as np -from .meta_vision import VisionDataset -class Objects365(VisionDataset): - r"""Objects365 Dataset.
- """ - - supported_order = ( - "image", - "boxes", - "boxes_category", - "info", - ) - - def __init__( - self, root, ann_file, remove_images_without_annotations=False, *, order=None - ): - super().__init__(root, order=order, supported_order=self.supported_order) - - with open(ann_file, "r") as f: - dataset = json.load(f) - - self.imgs = dict() - for img in dataset["images"]: - self.imgs[img["id"]] = img - - self.img_to_anns = defaultdict(list) - for ann in dataset["annotations"]: - # for saving memory - if ( - "boxes" not in self.order - and "boxes_category" not in self.order - and "bbox" in ann - ): - del ann["bbox"] - self.img_to_anns[ann["image_id"]].append(ann) - - self.cats = dict() - for cat in dataset["categories"]: - self.cats[cat["id"]] = cat - - self.ids = list(sorted(self.imgs.keys())) - - # filter images without detection annotations - if remove_images_without_annotations: - ids = [] - for img_id in self.ids: - anno = self.img_to_anns[img_id] - # filter crowd annotations - anno = [obj for obj in anno if obj["iscrowd"] == 0] - anno = [ - obj for obj in anno if obj["bbox"][2] > 0 and obj["bbox"][3] > 0 - ] - if len(anno) > 0: - ids.append(img_id) - self.img_to_anns[img_id] = anno - else: - del self.imgs[img_id] - del self.img_to_anns[img_id] - self.ids = ids - - self.json_category_id_to_contiguous_id = { - v: i + 1 for i, v in enumerate(sorted(self.cats.keys())) - } - - self.contiguous_category_id_to_json_id = { - v: k for k, v in self.json_category_id_to_contiguous_id.items() - } - - def __getitem__(self, index): - img_id = self.ids[index] - anno = self.img_to_anns[img_id] - - target = [] - for k in self.order: - if k == "image": - file_name = self.imgs[img_id]["file_name"] - path = os.path.join(self.root, file_name) - image = cv2.imread(path, cv2.IMREAD_COLOR) - target.append(image) - elif k == "boxes": - boxes = [obj["bbox"] for obj in anno] - boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4) - # transfer boxes from xywh to xyxy - boxes[:, 2:] += boxes[:, :2] - target.append(boxes) - elif k == "boxes_category": - boxes_category = [obj["category_id"] for obj in anno] - boxes_category = [ - self.json_category_id_to_contiguous_id[c] for c in boxes_category - ] - boxes_category = np.array(boxes_category, dtype=np.int32) - target.append(boxes_category) - elif k == "info": - info = self.imgs[img_id] - info = [info["height"], info["width"], info["file_name"]] - target.append(info) - else: - raise NotImplementedError - - return tuple(target) - - def __len__(self): - return len(self.ids) - - def get_img_info(self, index): - img_id = self.ids[index] - img_info = self.imgs[img_id] - return img_info - - class_names = ( - "person", - "sneakers", - "chair", - "hat", - "lamp", - "bottle", - "cabinet/shelf", - "cup", - "car", - "glasses", - "picture/frame", - "desk", - "handbag", - "street lights", - "book", - "plate", - "helmet", - "leather shoes", - "pillow", - "glove", - "potted plant", - "bracelet", - "flower", - "tv", - "storage box", - "vase", - "bench", - "wine glass", - "boots", - "bowl", - "dining table", - "umbrella", - "boat", - "flag", - "speaker", - "trash bin/can", - "stool", - "backpack", - "couch", - "belt", - "carpet", - "basket", - "towel/napkin", - "slippers", - "barrel/bucket", - "coffee table", - "suv", - "toy", - "tie", - "bed", - "traffic light", - "pen/pencil", - "microphone", - "sandals", - "canned", - "necklace", - "mirror", - "faucet", - "bicycle", - "bread", - "high heels", - "ring", - "van", - "watch", - "sink", - "horse", - "fish", - "apple", - "camera", 
- "candle", - "teddy bear", - "cake", - "motorcycle", - "wild bird", - "laptop", - "knife", - "traffic sign", - "cell phone", - "paddle", - "truck", - "cow", - "power outlet", - "clock", - "drum", - "fork", - "bus", - "hanger", - "nightstand", - "pot/pan", - "sheep", - "guitar", - "traffic cone", - "tea pot", - "keyboard", - "tripod", - "hockey", - "fan", - "dog", - "spoon", - "blackboard/whiteboard", - "balloon", - "air conditioner", - "cymbal", - "mouse", - "telephone", - "pickup truck", - "orange", - "banana", - "airplane", - "luggage", - "skis", - "soccer", - "trolley", - "oven", - "remote", - "baseball glove", - "paper towel", - "refrigerator", - "train", - "tomato", - "machinery vehicle", - "tent", - "shampoo/shower gel", - "head phone", - "lantern", - "donut", - "cleaning products", - "sailboat", - "tangerine", - "pizza", - "kite", - "computer box", - "elephant", - "toiletries", - "gas stove", - "broccoli", - "toilet", - "stroller", - "shovel", - "baseball bat", - "microwave", - "skateboard", - "surfboard", - "surveillance camera", - "gun", - "life saver", - "cat", - "lemon", - "liquid soap", - "zebra", - "duck", - "sports car", - "giraffe", - "pumpkin", - "piano", - "stop sign", - "radiator", - "converter", - "tissue ", - "carrot", - "washing machine", - "vent", - "cookies", - "cutting/chopping board", - "tennis racket", - "candy", - "skating and skiing shoes", - "scissors", - "folder", - "baseball", - "strawberry", - "bow tie", - "pigeon", - "pepper", - "coffee machine", - "bathtub", - "snowboard", - "suitcase", - "grapes", - "ladder", - "pear", - "american football", - "basketball", - "potato", - "paint brush", - "printer", - "billiards", - "fire hydrant", - "goose", - "projector", - "sausage", - "fire extinguisher", - "extension cord", - "facial mask", - "tennis ball", - "chopsticks", - "electronic stove and gas stove", - "pie", - "frisbee", - "kettle", - "hamburger", - "golf club", - "cucumber", - "clutch", - "blender", - "tong", - "slide", - "hot dog", - "toothbrush", - "facial cleanser", - "mango", - "deer", - "egg", - "violin", - "marker", - "ship", - "chicken", - "onion", - "ice cream", - "tape", - "wheelchair", - "plum", - "bar soap", - "scale", - "watermelon", - "cabbage", - "router/modem", - "golf ball", - "pine apple", - "crane", - "fire truck", - "peach", - "cello", - "notepaper", - "tricycle", - "toaster", - "helicopter", - "green beans", - "brush", - "carriage", - "cigar", - "earphone", - "penguin", - "hurdle", - "swing", - "radio", - "CD", - "parking meter", - "swan", - "garlic", - "french fries", - "horn", - "avocado", - "saxophone", - "trumpet", - "sandwich", - "cue", - "kiwi fruit", - "bear", - "fishing rod", - "cherry", - "tablet", - "green vegetables", - "nuts", - "corn", - "key", - "screwdriver", - "globe", - "broom", - "pliers", - "volleyball", - "hammer", - "eggplant", - "trophy", - "dates", - "board eraser", - "rice", - "tape measure/ruler", - "dumbbell", - "hamimelon", - "stapler", - "camel", - "lettuce", - "goldfish", - "meat balls", - "medal", - "toothpaste", - "antelope", - "shrimp", - "rickshaw", - "trombone", - "pomegranate", - "coconut", - "jellyfish", - "mushroom", - "calculator", - "treadmill", - "butterfly", - "egg tart", - "cheese", - "pig", - "pomelo", - "race car", - "rice cooker", - "tuba", - "crosswalk sign", - "papaya", - "hair drier", - "green onion", - "chips", - "dolphin", - "sushi", - "urinal", - "donkey", - "electric drill", - "spring rolls", - "tortoise/turtle", - "parrot", - "flute", - "measuring cup", - "shark", - "steak", - "poker 
card", - "binoculars", - "llama", - "radish", - "noodles", - "yak", - "mop", - "crab", - "microscope", - "barbell", - "bread/bun", - "baozi", - "lion", - "red cabbage", - "polar bear", - "lighter", - "seal", - "mangosteen", - "comb", - "eraser", - "pitaya", - "scallop", - "pencil case", - "saw", - "table tennis paddle", - "okra", - "starfish", - "eagle", - "monkey", - "durian", - "game board", - "rabbit", - "french horn", - "ambulance", - "asparagus", - "hoverboard", - "pasta", - "target", - "hotair balloon", - "chainsaw", - "lobster", - "iron", - "flashlight", - ) diff --git a/python_module/megengine/data/dataset/vision/utils.py b/python_module/megengine/data/dataset/vision/utils.py deleted file mode 100644 index de6657d7..00000000 --- a/python_module/megengine/data/dataset/vision/utils.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import hashlib -import os -import tarfile - -from ....distributed.util import is_distributed -from ....logger import get_logger -from ....utils.http_download import download_from_url - -IMG_EXT = (".jpg", ".png", ".jpeg", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp") - -logger = get_logger(__name__) - - -def _default_dataset_root(): - default_dataset_root = os.path.expanduser( - os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "megengine") - ) - - return default_dataset_root - - -def load_raw_data_from_url( - url: str, filename: str, target_md5: str, raw_data_dir: str, timeout: int -): - cached_file = os.path.join(raw_data_dir, filename) - logger.debug( - "load_raw_data_from_url: downloading to or using cached %s ...", cached_file - ) - if not os.path.exists(cached_file): - if is_distributed(): - logger.warning( - "Downloading raw data in DISTRIBUTED mode\n" - " File may be downloaded multiple times. We recommend\n" - " users to download in single process first." 
- ) - md5 = download_from_url(url, cached_file, http_read_timeout=timeout) - else: - md5 = calculate_md5(cached_file) - if target_md5 == md5: - logger.debug("%s exists with correct md5: %s", filename, target_md5) - else: - os.remove(cached_file) - raise RuntimeError("{} exists but fails to match md5".format(filename)) -def calculate_md5(filename): - m = hashlib.md5() - with open(filename, "rb") as f: - while True: - data = f.read(4096) - if not data: - break - m.update(data) - return m.hexdigest() -def is_img(filename): - return filename.lower().endswith(IMG_EXT) -def untar(path, to=None, remove=False): - if to is None: - to = os.path.dirname(path) - with tarfile.open(path, "r") as tar: - tar.extractall(path=to) - if remove: - os.remove(path) -def untargz(path, to=None, remove=False): - if path.endswith(".tar.gz"): - if to is None: - to = os.path.dirname(path) - with tarfile.open(path, "r:gz") as tar: - tar.extractall(path=to) - else: - raise ValueError("path %s does not end with .tar.gz" % path) - if remove: - os.remove(path) diff --git a/python_module/megengine/data/dataset/vision/voc.py b/python_module/megengine/data/dataset/vision/voc.py deleted file mode 100644 index b22fd2fa..00000000 --- a/python_module/megengine/data/dataset/vision/voc.py +++ /dev/null @@ -1,185 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# --------------------------------------------------------------------- -# Part of the following code in this file refers to torchvision -# BSD 3-Clause License -# -# Copyright (c) Soumith Chintala 2016, -# All rights reserved. -# --------------------------------------------------------------------- -import collections.abc -import os -import xml.etree.ElementTree as ET -import cv2 -import numpy as np -from .meta_vision import VisionDataset -class PascalVOC(VisionDataset): - r"""Pascal VOC Dataset. - """ - supported_order = ( - "image", - "boxes", - "boxes_category", - "mask", - "info", - ) - def __init__(self, root, image_set, *, order=None): - if ("boxes" in order or "boxes_category" in order) and "mask" in order: - raise ValueError( - "PascalVOC only supports boxes & boxes_category or mask, not both."
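# Usage sketch for the verification helpers above (hypothetical local path;
# the checksum is the devkit md5 from raw_file_meta in imagenet.py above):
path = "/data/ILSVRC2012_devkit_t12.tar.gz"
if calculate_md5(path) == "fa75699e90414af021442c21a62c3abf":
    untargz(path)  # extracts next to the archive when `to` is None
else:
    raise RuntimeError("checksum mismatch, %s may be damaged" % path)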
- ) - - super().__init__(root, order=order, supported_order=self.supported_order) - - if not os.path.isdir(self.root): - raise RuntimeError("Dataset not found or corrupted.") - - self.image_set = image_set - image_dir = os.path.join(self.root, "JPEGImages") - - if "boxes" in order or "boxes_category" in order: - annotation_dir = os.path.join(self.root, "Annotations") - splitdet_dir = os.path.join(self.root, "ImageSets/Main") - split_f = os.path.join(splitdet_dir, image_set.rstrip("\n") + ".txt") - with open(os.path.join(split_f), "r") as f: - self.file_names = [x.strip() for x in f.readlines()] - self.images = [os.path.join(image_dir, x + ".jpg") for x in self.file_names] - self.annotations = [ - os.path.join(annotation_dir, x + ".xml") for x in self.file_names - ] - assert len(self.images) == len(self.annotations) - elif "mask" in order: - if "aug" in image_set: - mask_dir = os.path.join(self.root, "SegmentationClass_aug") - else: - mask_dir = os.path.join(self.root, "SegmentationClass") - splitmask_dir = os.path.join(self.root, "ImageSets/Segmentation") - split_f = os.path.join(splitmask_dir, image_set.rstrip("\n") + ".txt") - with open(os.path.join(split_f), "r") as f: - self.file_names = [x.strip() for x in f.readlines()] - self.images = [os.path.join(image_dir, x + ".jpg") for x in self.file_names] - self.masks = [os.path.join(mask_dir, x + ".png") for x in self.file_names] - assert len(self.images) == len(self.masks) - else: - raise NotImplementedError - - self.img_infos = dict() - - def __getitem__(self, index): - target = [] - for k in self.order: - if k == "image": - image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) - target.append(image) - elif k == "boxes": - anno = self.parse_voc_xml(ET.parse(self.annotations[index]).getroot()) - boxes = [obj["bndbox"] for obj in anno["annotation"]["object"]] - # boxes type xyxy - boxes = [ - (bb["xmin"], bb["ymin"], bb["xmax"], bb["ymax"]) for bb in boxes - ] - boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4) - target.append(boxes) - elif k == "boxes_category": - anno = self.parse_voc_xml(ET.parse(self.annotations[index]).getroot()) - boxes_category = [obj["name"] for obj in anno["annotation"]["object"]] - boxes_category = [ - self.class_names.index(bc) + 1 for bc in boxes_category - ] - boxes_category = np.array(boxes_category, dtype=np.int32) - target.append(boxes_category) - elif k == "mask": - if "aug" in self.image_set: - mask = cv2.imread(self.masks[index], cv2.IMREAD_GRAYSCALE) - else: - mask = cv2.imread(self.masks[index], cv2.IMREAD_COLOR) - mask = self._trans_mask(mask) - mask = mask[:, :, np.newaxis] - target.append(mask) - elif k == "info": - info = self.get_img_info(index, image) - info = [info["height"], info["width"], info["file_name"]] - target.append(info) - else: - raise NotImplementedError - - return tuple(target) - - def __len__(self): - return len(self.images) - - def get_img_info(self, index, image=None): - if index not in self.img_infos: - if image is None: - image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) - self.img_infos[index] = dict( - height=image.shape[0], - width=image.shape[1], - file_name=self.file_names[index], - ) - return self.img_infos[index] - - def _trans_mask(self, mask): - label = np.ones(mask.shape[:2]) * 255 - for i in range(len(self.class_colors)): - b, g, r = self.class_colors[i] - label[ - (mask[:, :, 0] == b) & (mask[:, :, 1] == g) & (mask[:, :, 2] == r) - ] = i - return label.astype(np.uint8) - - def parse_voc_xml(self, node): - voc_dict = {} - children = list(node) - if 
children: - def_dic = collections.defaultdict(list) - for dc in map(self.parse_voc_xml, children): - for ind, v in dc.items(): - def_dic[ind].append(v) - if node.tag == "annotation": - def_dic["object"] = [def_dic["object"]] - voc_dict = { - node.tag: { - ind: v[0] if len(v) == 1 else v for ind, v in def_dic.items() - } - } - if node.text: - text = node.text.strip() - if not children: - voc_dict[node.tag] = text - return voc_dict - - class_names = ( - "aeroplane", - "bicycle", - "bird", - "boat", - "bottle", - "bus", - "car", - "cat", - "chair", - "cow", - "diningtable", - "dog", - "horse", - "motorbike", - "person", - "pottedplant", - "sheep", - "sofa", - "train", - "tvmonitor", - ) diff --git a/python_module/megengine/data/sampler.py b/python_module/megengine/data/sampler.py deleted file mode 100644 index dbd5d3a3..00000000 --- a/python_module/megengine/data/sampler.py +++ /dev/null @@ -1,274 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections.abc -import math -from abc import ABC -from typing import Any, Generator, Iterator, List, Union - -import numpy as np - -import megengine.distributed as dist - - -class Sampler(ABC): - def __init__( - self, - dataset, - batch_size=1, - drop_last=False, - num_samples=None, - world_size=None, - rank=None, - seed=None, - ): - r""" - An abstract class for all sampler - - :type dataset: `dataset` - :param dataset: dataset to sample from - :type batch_size: positive integer - :param batch_size: batch size for batch method - :type drop_last: bool - :param drop_last: set ``True`` to drop the last incomplete batch, - if the dataset size is not divisible by the batch size. If ``False`` and - the size of dataset is not divisible by the batch_size, then the last batch will - be smaller. 
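# Worked example of parse_voc_xml above: an ElementTree node is folded into
# nested dicts, with leaf text kept as the value (toy annotation; assumes a
# PascalVOC instance `ds` is available).
import xml.etree.ElementTree as ET

node = ET.fromstring("<size><width>10</width><height>20</height></size>")
# ds.parse_voc_xml(node) -> {"size": {"width": "10", "height": "20"}}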
(default: ``False``) - :type num_samples: positive integer - :param num_samples: number of samples assigned to one rank - :type world_size: positive integer - :param world_size: number of ranks - :type rank: non-negative integer within 0 and world_size - :param rank: rank id, non-negative integer within 0 and ``world_size`` - :type seed: non-negative integer - :param seed: seed for random operators - """ - if ( - not isinstance(batch_size, int) - or isinstance(batch_size, bool) - or batch_size <= 0 - ): - raise ValueError( - "batch_size should be a positive integer value, " - "but got batch_size={}".format(batch_size) - ) - if not isinstance(drop_last, bool): - raise ValueError( - "drop_last should be a boolean value, but got " - "drop_last={}".format(drop_last) - ) - if num_samples is not None and ( - not isinstance(num_samples, int) - or isinstance(num_samples, bool) - or num_samples <= 0 - ): - raise ValueError( - "num_samples should be a positive integer " - "value, but got num_samples={}".format(num_samples) - ) - self.batch_size = batch_size - self.dataset = dataset - self.drop_last = drop_last - if world_size is None: - world_size = dist.get_world_size() if dist.is_distributed() else 1 - self.world_size = world_size - if rank is None: - rank = dist.get_rank() if dist.is_distributed() else 0 - self.rank = rank - if num_samples is None: - num_samples = len(self.dataset) - self.num_samples = int(math.ceil(num_samples / self.world_size)) - # Make sure seeds are the same at each rank - if seed is None and self.world_size > 1: - seed = 0 - self.rng = np.random.RandomState(seed) - def __iter__(self) -> Union[Generator, Iterator]: - return self.batch() - def __len__(self) -> int: - if self.drop_last: - return self.num_samples // self.batch_size - else: - return int(math.ceil(self.num_samples / self.batch_size)) - def sample(self): - """ - return a list containing all sample indices - """ - raise NotImplementedError - def scatter(self, indices) -> List: - r""" - scatter method is used for splitting indices into subsets, each subset - will be assigned to a rank. Indices are evenly split by default. - If a customized indices assignment method is needed, please override this method - """
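# Worked example of the rank-splitting scheme described above: with 5
# samples and world_size=2, num_samples=ceil(5/2)=3, so padding extends the
# index list to total_size=6 and rank r keeps indices[r::world_size].
indices = [0, 1, 2, 3, 4]
total_size, world_size = 6, 2
indices += indices[: (total_size - len(indices))]  # -> [0, 1, 2, 3, 4, 0]
rank0 = indices[0:total_size:world_size]           # -> [0, 2, 4]
rank1 = indices[1:total_size:world_size]           # -> [1, 3, 0]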
- total_size = self.num_samples * self.world_size - # add extra indices to make it evenly divisible - indices += indices[: (total_size - len(indices))] - assert len(indices) == total_size - # subsample - indices = indices[self.rank : total_size : self.world_size] - assert len(indices) == self.num_samples - return indices - def batch(self) -> Iterator[List[Any]]: - r""" - batch method provides a batch indices generator - """ - indices = list(self.sample()) - # user might pass the world_size parameter without dist, - # so dist.is_distributed() should not be used - if self.world_size > 1: - indices = self.scatter(indices) - step, length = self.batch_size, len(indices) - batch_index = [indices[i : i + step] for i in range(0, length, step)] - if self.drop_last and len(batch_index[-1]) < self.batch_size: - batch_index.pop() - return iter(batch_index) -class SequentialSampler(Sampler): - def __init__( - self, - dataset, - batch_size=1, - drop_last=False, - indices=None, - world_size=None, - rank=None, - ): - r""" - Sample elements sequentially - """ - super().__init__(dataset, batch_size, drop_last, None, world_size, rank) - if indices is not None and not isinstance(indices, collections.abc.Sequence): - raise ValueError( - "indices should be None or a sequence, " - "but got indices={}".format(indices) - ) - self.indices = indices - def sample(self) -> Iterator[Any]: - r""" - return a generator - """ - if self.indices is None: - return iter(range(len(self.dataset))) - else: - return self.indices -class RandomSampler(Sampler): - def __init__( - self, - dataset, - batch_size=1, - drop_last=False, - indices=None, - world_size=None, - rank=None, - seed=None, - ): - r""" - Sample elements randomly without replacement - """ - super().__init__(dataset, batch_size, drop_last, None, world_size, rank, seed) - if indices is not None and not isinstance(indices, collections.abc.Sequence): - raise ValueError( - "indices should be None or a sequence, " - "but got indices={}".format(indices) - ) - self.indices = indices - def sample(self) -> List: - if self.indices is None: - return self.rng.permutation(len(self.dataset)).tolist() - else: - return self.rng.permutation(self.indices).tolist() -class ReplacementSampler(Sampler): - def __init__( - self, - dataset, - batch_size=1, - drop_last=False, - num_samples=None, - weights=None, - world_size=None, - rank=None, - seed=None, - ): - r""" - Sample elements randomly with replacement - - :type weights: List - :param weights: weights for sampling indices, they can be unnormalized weights - """ - super().__init__( - dataset, batch_size, drop_last, num_samples, world_size, rank, seed - ) - if weights is not None: - if not isinstance(weights, collections.abc.Sequence): - raise ValueError( - "weights should be None or a sequence, " - "but got weights={}".format(weights) - ) - if len(weights) != len(dataset): - raise ValueError( - "len(dataset)={} should be equal to " - "len(weights)={}".format(len(dataset), len(weights)) - ) - self.weights = weights - if self.weights is not None: - self.weights = np.array(weights) / sum(weights) - def sample(self) -> List: - n = len(self.dataset) - if self.weights is None: - return self.rng.randint(n, size=self.num_samples).tolist() - else: - return self.rng.multinomial(n, self.weights, self.num_samples).tolist() -class Infinite(Sampler): - r"""Infinite Sampler wrapper for basic sampler""" - def sample(self): - raise 
NotImplementedError("sample method not supported in Infinite") - - def __init__(self, sampler): - self.sampler = sampler - self.sampler_iter = iter(self.sampler) - - def __iter__(self): - return self - - def __next__(self): - try: - index = next(self.sampler_iter) - except StopIteration: - self.sampler_iter = iter(self.sampler) - index = next(self.sampler_iter) - return index - - def __len__(self): - return np.iinfo(np.int64).max diff --git a/python_module/megengine/data/transform/__init__.py b/python_module/megengine/data/transform/__init__.py deleted file mode 100644 index 30424cbc..00000000 --- a/python_module/megengine/data/transform/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .meta_transform import PseudoTransform, Transform -from .vision import * diff --git a/python_module/megengine/data/transform/meta_transform.py b/python_module/megengine/data/transform/meta_transform.py deleted file mode 100644 index d7fd4f47..00000000 --- a/python_module/megengine/data/transform/meta_transform.py +++ /dev/null @@ -1,31 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from abc import ABC, abstractmethod -from typing import Sequence, Tuple - - -class Transform(ABC): - """ - rewrite apply method in subclass - """ - - def apply_batch(self, inputs: Sequence[Tuple]): - return tuple(self.apply(input) for input in inputs) - - @abstractmethod - def apply(self, input: Tuple): - pass - - def __repr__(self): - return self.__class__.__name__ - - -class PseudoTransform(Transform): - def apply(self, input: Tuple): - return input diff --git a/python_module/megengine/data/transform/vision/__init__.py b/python_module/megengine/data/transform/vision/__init__.py deleted file mode 100644 index d90c9e98..00000000 --- a/python_module/megengine/data/transform/vision/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .transform import * diff --git a/python_module/megengine/data/transform/vision/functional.py b/python_module/megengine/data/transform/vision/functional.py deleted file mode 100644 index e2f4e512..00000000 --- a/python_module/megengine/data/transform/vision/functional.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
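# Minimal sketch of the Transform contract from meta_transform.py above
# (illustrative subclass; assumes Transform is in scope): apply() handles one
# sample, apply_batch() maps it over a sequence of samples.
class AddOne(Transform):
    def apply(self, input):
        return tuple(x + 1 for x in input)

assert AddOne().apply_batch([(1, 2), (3, 4)]) == ((2, 3), (4, 5))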
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections.abc -import functools -import random -import cv2 -import numpy as np -def wrap_keepdims(func): - """Wrapper to keep the dimensions of input images unchanged""" - @functools.wraps(func) - def wrapper(image, *args, **kwargs): - if len(image.shape) != 3: - raise ValueError( - "image must have 3 dims, but got {} dims".format(len(image.shape)) - ) - ret = func(image, *args, **kwargs) - if len(ret.shape) == 2: - ret = ret[:, :, np.newaxis] - return ret - return wrapper -@wrap_keepdims -def to_gray(image): - r""" - Change BGR format image's color space to gray - - :param image: Input BGR format image, with (H, W, C) shape - :return: Gray format image, with (H, W, C) shape - """ - return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) -@wrap_keepdims -def to_bgr(image): - r""" - Change gray format image's color space to BGR - - :param image: input Gray format image, with (H, W, C) shape - :return: BGR format image, with (H, W, C) shape - """ - return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) -@wrap_keepdims -def pad(input, size, value): - r""" - Pad input data with *value* and given *size* - - :param input: Input data, with (H, W, C) shape - :param size: Padding size of input data, it could be an integer or a sequence. - If it's an integer, the input data will be padded in four directions. - If it's a sequence containing two integers, the bottom and right side - of input data will be padded. - If it's a sequence containing four integers, the top, bottom, left, right - side of input data will be padded with given size. - :param value: Padding value of data, could be a sequence of int or float. - If it's a float value, the dtype of the image will be cast to float32 as well. - :return: Padded image - """ - if isinstance(size, int): - size = (size, size, size, size) - elif isinstance(size, collections.abc.Sequence) and len(size) == 2: - size = (0, size[0], 0, size[1]) - if np.array(value).dtype == float: - input = input.astype(np.float32) - return cv2.copyMakeBorder(input, *size, cv2.BORDER_CONSTANT, value=value) -@wrap_keepdims -def flip(image, flipCode): - r""" - According to the flipCode (the type of flip), flip the input image - - :param image: Input image, with (H, W, C) shape - :param flipCode: code that indicates the type of flip. - 1 : Flip horizontally - 0 : Flip vertically - -1 : Flip horizontally and vertically - :return: BGR format image, with (H, W, C) shape - """ - return cv2.flip(image, flipCode=flipCode) -@wrap_keepdims -def resize(input, size, interpolation=cv2.INTER_LINEAR): - r""" - Resize the input data to given size - - :param input: Input data, could be image or masks, with (H, W, C) shape - :param size: Target size of input data, with (height, width) shape. - :param interpolation: Interpolation method. - :return: Resized data, with (H, W, C) shape - """ - if len(size) != 2: - raise ValueError("resize needs (h, w), but got {}".format(size)) - if isinstance(interpolation, collections.abc.Sequence): - interpolation = random.choice(interpolation) - return cv2.resize(input, size[::-1], interpolation=interpolation)
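# Worked example of the size normalization in pad() above: an int pads all
# four sides, a 2-sequence pads (bottom, right); cv2.copyMakeBorder then
# receives the order (top, bottom, left, right).
size = (4, 8)                    # pad 4 px at the bottom, 8 px on the right
size = (0, size[0], 0, size[1])  # -> (0, 4, 0, 8)
assert size == (0, 4, 0, 8)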
diff --git a/python_module/megengine/data/transform/vision/transform.py b/python_module/megengine/data/transform/vision/transform.py deleted file mode 100644 index bf3834a9..00000000 --- a/python_module/megengine/data/transform/vision/transform.py +++ /dev/null @@ -1,1025 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections.abc -import math -from typing import Sequence, Tuple -import cv2 -import numpy as np -from megengine.data.transform import Transform -from megengine.data.transform.vision import functional as F -__all__ = [ - "VisionTransform", - "ToMode", - "Compose", - "TorchTransformCompose", - "Pad", - "Resize", - "ShortestEdgeResize", - "RandomResize", - "RandomCrop", - "RandomResizedCrop", - "CenterCrop", - "RandomHorizontalFlip", - "RandomVerticalFlip", - "Normalize", - "GaussianNoise", - "BrightnessTransform", - "SaturationTransform", - "ContrastTransform", - "HueTransform", - "ColorJitter", - "Lighting", -] -class VisionTransform(Transform): - r""" - Base class of all transforms used in computer vision. - Calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*() - methods. If you want to implement a self-defined transform method for images, - override the _apply_image method in a subclass. - - :param order: Input type order. Input is a tuple containing different structures, - order is used to specify the order of structures. For example, if your input - is (image, boxes) type, then the order should be ("image", "boxes"). - Currently available strings & data types are described below: - - * "image": input image, with shape of (H, W, C) - * "coords": coordinates, with shape of (N, 2) - * "boxes": bounding boxes, with shape of (N, 4), "xyxy" format, - the 1st "xy" represents the top left point of a box, - the 2nd "xy" represents the right bottom point. - * "mask": map used for segmentation, with shape of (H, W, 1) - * "keypoints": keypoints with shape of (N, K, 3), N for number of instances, - and K for number of keypoints in one instance. The first two entries - of the last axis are the coordinates of a keypoint and the 3rd entry is - its label. - * "polygons": a sequence containing numpy arrays, its length is the number of instances. - Each numpy array represents the polygon coordinates of one instance. - * "category": categories for some data type. For example, "image_category" - means the category of the input image and "boxes_category" means the categories of - bounding boxes. - * "info": information for images such as image shapes and image path. - - You can also customize your data types only if you implement the corresponding - _apply_*() methods, otherwise ``NotImplementedError`` will be raised.
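# Sketch of the order-based dispatch described above (hypothetical subclass
# and "score" key; assumes VisionTransform is in scope): a custom key only
# needs a matching _apply_<key> method.
class ScaleScore(VisionTransform):
    def __init__(self, factor, *, order=None):
        super().__init__(order)
        self.factor = factor

    def _apply_image(self, image):
        return image  # leave images untouched

    def _apply_score(self, score):
        return score * self.factor

t = ScaleScore(2.0, order=("image", "score"))
# t.apply((img, 0.5)) -> (img, 1.0)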
- """ - - def __init__(self, order=None): - super().__init__() - if order is None: - order = ("image",) - elif not isinstance(order, collections.abc.Sequence): - raise ValueError( - "order should be a sequence, but got order={}".format(order) - ) - for k in order: - if k in ("batch",): - raise ValueError("{} is invalid data type".format(k)) - elif k.endswith("category") or k.endswith("info"): - # when the key is *category or info, we should do nothing - # if the corresponding apply methods are not implemented. - continue - elif self._get_apply(k) is None: - raise NotImplementedError("{} is unsupported data type".format(k)) - self.order = order - - def apply_batch(self, inputs: Sequence[Tuple]): - r"""Apply transform on batch input data""" - return tuple(self.apply(input) for input in inputs) - - def apply(self, input: Tuple): - r"""Apply transform on single input data""" - if not isinstance(input, tuple): - input = (input,) - - output = [] - for i in range(min(len(input), len(self.order))): - apply_func = self._get_apply(self.order[i]) - if apply_func is None: - output.append(input[i]) - else: - output.append(apply_func(input[i])) - if len(input) > len(self.order): - output.extend(input[len(self.order) :]) - - if len(output) == 1: - output = output[0] - else: - output = tuple(output) - return output - - def _get_apply(self, key): - return getattr(self, "_apply_{}".format(key), None) - - def _get_image(self, input: Tuple): - if not isinstance(input, tuple): - input = (input,) - return input[self.order.index("image")] - - def _apply_image(self, image): - raise NotImplementedError - - def _apply_coords(self, coords): - raise NotImplementedError - - def _apply_boxes(self, boxes): - idxs = np.array([(0, 1), (2, 1), (0, 3), (2, 3)]).flatten() - coords = np.asarray(boxes).reshape(-1, 4)[:, idxs].reshape(-1, 2) - coords = self._apply_coords(coords).reshape((-1, 4, 2)) - minxy = coords.min(axis=1) - maxxy = coords.max(axis=1) - trans_boxes = np.concatenate((minxy, maxxy), axis=1) - return trans_boxes - - def _apply_mask(self, mask): - raise NotImplementedError - - def _apply_keypoints(self, keypoints): - coords, visibility = keypoints[..., :2], keypoints[..., 2:] - trans_coords = [self._apply_coords(p) for p in coords] - return np.concatenate((trans_coords, visibility), axis=-1) - - def _apply_polygons(self, polygons): - return [[self._apply_coords(p) for p in instance] for instance in polygons] - - -class ToMode(VisionTransform): - r"""Change input data to a target mode. - For example, most transforms use HWC mode image, - while the Neural Network might use CHW mode input tensor - - :param mode: Output mode of input. Use "CHW" mode by default. - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, mode="CHW", *, order=None): - super().__init__(order) - assert mode in ["CHW"], "unsupported mode: {}".format(mode) - self.mode = mode - - def _apply_image(self, image): - if self.mode == "CHW": - return np.ascontiguousarray(np.rollaxis(image, 2)) - return image - - def _apply_coords(self, coords): - return coords - - def _apply_mask(self, mask): - if self.mode == "CHW": - return np.ascontiguousarray(np.rollaxis(mask, 2)) - return mask - - -class Compose(VisionTransform): - r""" - Composes several transforms together. - - :param transforms: List of :class:`VisionTransform` to compose. - :param batch_compose: Whether use shuffle_indices for batch data or not. - If True, use original input sequence. - Otherwise, the shuffle_indices will be used for transforms. 
- :param shuffle_indices: Indices used for random shuffle, start at 1. - For example, if shuffle_indices is [(1, 3), (2, 4)], then the 1st and 3rd transform - will be random shuffled, the 2nd and 4th transform will also be shuffled. - :param order: The same with :class:`VisionTransform` - - Example: - - ..testcode:: - - from megengine.data.transform import RandomHorizontalFlip, RandomVerticalFlip, CenterCrop, ToMode, Compose - - transform_func = Compose([ - RandomHorizontalFlip(), - RandomVerticalFlip(), - CenterCrop(100), - ToMode("CHW"), - ], - shuffle_indices=[(1, 2, 3)] - ) - """ - - def __init__( - self, transforms=[], batch_compose=False, shuffle_indices=None, *, order=None - ): - super().__init__(order) - self.transforms = transforms - self._set_order() - - if batch_compose and shuffle_indices is not None: - raise ValueError( - "Do not support shuffle when apply transforms along the whole batch" - ) - self.batch_compose = batch_compose - - if shuffle_indices is not None: - shuffle_indices = [tuple(x - 1 for x in idx) for idx in shuffle_indices] - self.shuffle_indices = shuffle_indices - - def _set_order(self): - for t in self.transforms: - t.order = self.order - if isinstance(t, Compose): - t._set_order() - - def apply_batch(self, inputs: Sequence[Tuple]): - if self.batch_compose: - for t in self.transforms: - inputs = t.apply_batch(inputs) - return inputs - else: - return super().apply_batch(inputs) - - def apply(self, input: Tuple): - for t in self._shuffle(): - input = t.apply(input) - return input - - def _shuffle(self): - if self.shuffle_indices is not None: - source_idx = list(range(len(self.transforms))) - for idx in self.shuffle_indices: - shuffled = np.random.permutation(idx).tolist() - for src, dst in zip(idx, shuffled): - source_idx[src] = dst - return [self.transforms[i] for i in source_idx] - else: - return self.transforms - - -class TorchTransformCompose(VisionTransform): - r""" - Compose class used for transforms in torchvision, only support PIL image, - some transforms with tensor in torchvision are not supported, - such as Normalize and ToTensor in torchvision. - - :param transforms: The same with ``Compose`` - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, transforms, *, order=None): - super().__init__(order) - self.transforms = transforms - - def _apply_image(self, image): - from PIL import Image - - try: - import accimage - except ImportError: - accimage = None - - if image.shape[0] == 3: # CHW - image = np.ascontiguousarray(image[[2, 1, 0]]) - elif image.shape[2] == 3: # HWC - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - image = Image.fromarray(image.astype(np.uint8)) - - for t in self.transforms: - image = t(image) - - if isinstance(image, Image.Image) or ( - accimage is not None and isinstance(image, accimage.Image) - ): - image = np.array(image, dtype=np.uint8) - if image.shape[0] == 3: # CHW - image = np.ascontiguousarray(image[[2, 1, 0]]) - elif image.shape[2] == 3: # HWC - image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) - return image - - -class Pad(VisionTransform): - r"""Pad the input data. - - :param size: Padding size of input image, it could be integer or sequence. - If it's an integer, the input image will be padded in four directions. - If it's a sequence contains two integer, the bottom and right side - of image will be padded. - If it's a sequence contains four integer, the top, bottom, left, right - side of image will be padded with given size. 
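# Worked example of the shuffle_indices remapping used by Compose._shuffle
# above: the 1-based indices become 0-based internally, and only the listed
# positions are permuted (here, one possible permutation of four transforms).
import numpy as np

source_idx = [0, 1, 2, 3]  # transform positions, 0-based
idx = (0, 1, 2)            # stored form of shuffle_indices=[(1, 2, 3)]
shuffled = np.random.permutation(idx).tolist()
for src, dst in zip(idx, shuffled):
    source_idx[src] = dst
# e.g. source_idx -> [2, 0, 1, 3]; the 4th transform always stays in place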
- :param value: Padding value of image, could be a sequence of int or float. - if it's float value, the dtype of image will be casted to float32 also. - :param mask_value: Padding value of segmentation map. - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, size=0, value=0, mask_value=0, *, order=None): - super().__init__(order) - if isinstance(size, int): - size = (size, size, size, size) - elif isinstance(size, collections.abc.Sequence) and len(size) == 2: - size = (0, size[0], 0, size[1]) - elif not (isinstance(size, collections.abc.Sequence) and len(size) == 4): - raise ValueError( - "size should be a list/tuple which contains " - "(top, down, left, right) four pad sizes." - ) - self.size = size - self.value = value - if not isinstance(mask_value, int): - raise ValueError( - "mask_value should be a positive integer, " - "but got mask_value={}".format(mask_value) - ) - self.mask_value = mask_value - - def _apply_image(self, image): - return F.pad(image, self.size, self.value) - - def _apply_coords(self, coords): - coords[:, 0] += self.size[2] - coords[:, 1] += self.size[0] - return coords - - def _apply_mask(self, mask): - return F.pad(mask, self.size, self.mask_value) - - -class Resize(VisionTransform): - r"""Resize the input data. - - :param output_size: Target size of image, with (height, width) shape. - :param interpolation: Interpolation method. All methods are listed below: - - * cv2.INTER_NEAREST – a nearest-neighbor interpolation. - * cv2.INTER_LINEAR – a bilinear interpolation (used by default). - * cv2.INTER_AREA – resampling using pixel area relation. - * cv2.INTER_CUBIC – a bicubic interpolation over 4×4 pixel neighborhood. - * cv2.INTER_LANCZOS4 – a Lanczos interpolation over 8×8 pixel neighborhood. - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, output_size, interpolation=cv2.INTER_LINEAR, *, order=None): - super().__init__(order) - self.output_size = output_size - self.interpolation = interpolation - - def apply(self, input: Tuple): - self._shape_info = self._get_shape(self._get_image(input)) - return super().apply(input) - - def _apply_image(self, image): - h, w, th, tw = self._shape_info - if h == th and w == tw: - return image - return F.resize(image, (th, tw), self.interpolation) - - def _apply_coords(self, coords): - h, w, th, tw = self._shape_info - if h == th and w == tw: - return coords - coords[:, 0] = coords[:, 0] * (tw / w) - coords[:, 1] = coords[:, 1] * (th / h) - return coords - - def _apply_mask(self, mask): - h, w, th, tw = self._shape_info - if h == th and w == tw: - return mask - return F.resize(mask, (th, tw), cv2.INTER_NEAREST) - - def _get_shape(self, image): - h, w, _ = image.shape - if isinstance(self.output_size, int): - if min(h, w) == self.output_size: - return h, w, h, w - if h < w: - th = self.output_size - tw = int(self.output_size * w / h) - else: - tw = self.output_size - th = int(self.output_size * h / w) - return h, w, th, tw - else: - return (h, w, *self.output_size) - - -class ShortestEdgeResize(VisionTransform): - def __init__( - self, - min_size, - max_size, - sample_style="range", - interpolation=cv2.INTER_LINEAR, - *, - order=None - ): - super().__init__(order) - if sample_style not in ("range", "choice"): - raise NotImplementedError( - "{} is unsupported sample style".format(sample_style) - ) - self.sample_style = sample_style - if isinstance(min_size, int): - min_size = (min_size, min_size) - self.min_size = min_size - self.max_size = max_size - self.interpolation 
= interpolation
-
-    def apply(self, input: Tuple):
-        self._shape_info = self._get_shape(self._get_image(input))
-        return super().apply(input)
-
-    def _apply_image(self, image):
-        h, w, th, tw = self._shape_info
-        if h == th and w == tw:
-            return image
-        return F.resize(image, (th, tw), self.interpolation)
-
-    def _apply_coords(self, coords):
-        h, w, th, tw = self._shape_info
-        if h == th and w == tw:
-            return coords
-        coords[:, 0] = coords[:, 0] * (tw / w)
-        coords[:, 1] = coords[:, 1] * (th / h)
-        return coords
-
-    def _apply_mask(self, mask):
-        h, w, th, tw = self._shape_info
-        if h == th and w == tw:
-            return mask
-        return F.resize(mask, (th, tw), cv2.INTER_NEAREST)
-
-    def _get_shape(self, image):
-        h, w, _ = image.shape
-        if self.sample_style == "range":
-            size = np.random.randint(self.min_size[0], self.min_size[1] + 1)
-        else:
-            size = np.random.choice(self.min_size)
-
-        scale = size / min(h, w)
-        if h < w:
-            th, tw = size, scale * w
-        else:
-            th, tw = scale * h, size
-        if max(th, tw) > self.max_size:
-            scale = self.max_size / max(th, tw)
-            th = th * scale
-            tw = tw * scale
-        th = int(round(th))
-        tw = int(round(tw))
-        return h, w, th, tw
-
-
-class RandomResize(VisionTransform):
-    r"""Resize the input data randomly.
-
-    :param scale_range: Range ``(min, max)`` from which the scale factor is
-        uniformly sampled; height and width are scaled by the same factor.
-    :param interpolation: Interpolation method, the same as in :class:`Resize`.
-    :param order: The same with :class:`VisionTransform`
-    """
-
-    def __init__(self, scale_range, interpolation=cv2.INTER_LINEAR, *, order=None):
-        super().__init__(order)
-        self.scale_range = scale_range
-        self.interpolation = interpolation
-
-    def apply(self, input: Tuple):
-        self._shape_info = self._get_shape(self._get_image(input))
-        return super().apply(input)
-
-    def _apply_image(self, image):
-        h, w, th, tw = self._shape_info
-        if h == th and w == tw:
-            return image
-        return F.resize(image, (th, tw), self.interpolation)
-
-    def _apply_coords(self, coords):
-        h, w, th, tw = self._shape_info
-        if h == th and w == tw:
-            return coords
-        coords[:, 0] = coords[:, 0] * (tw / w)
-        coords[:, 1] = coords[:, 1] * (th / h)
-        return coords
-
-    def _apply_mask(self, mask):
-        h, w, th, tw = self._shape_info
-        if h == th and w == tw:
-            return mask
-        return F.resize(mask, (th, tw), cv2.INTER_NEAREST)
-
-    def _get_shape(self, image):
-        h, w, _ = image.shape
-        scale = np.random.uniform(*self.scale_range)
-        th = int(round(h * scale))
-        tw = int(round(w * scale))
-        return h, w, th, tw
-
-
-class RandomCrop(VisionTransform):
-    r"""Crop the input data randomly. The image is padded first; if the target
-    size is still larger than the padded image, it is padded again up to the
-    target size.
-
-    :param output_size: Target size of output image, with (height, width) shape.
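The resize-style transforms above (Resize, ShortestEdgeResize, RandomResize) all remap annotation coordinates with the same rule: x scales with the width ratio and y with the height ratio. A minimal numpy sketch of that rule; the function name and the (h, w, th, tw) convention mirror _shape_info and are illustrative only.

import numpy as np

def rescale_coords(coords, h, w, th, tw):
    """coords: (N, 2) array of (x, y) points on an (h, w) image."""
    out = coords.astype(np.float32)  # astype copies, so the input stays intact
    out[:, 0] *= tw / w  # x follows the width scale
    out[:, 1] *= th / h  # y follows the height scale
    return out

print(rescale_coords(np.array([[10, 20], [30, 40]]), h=100, w=200, th=50, tw=100))
# [[ 5. 10.]
#  [15. 20.]]  -- both axes halved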
- :param padding_size: The same with `size` in ``Pad`` - :param padding_value: The same with `value` in ``Pad`` - :param order: The same with :class:`VisionTransform` - """ - - def __init__( - self, - output_size, - padding_size=0, - padding_value=[0, 0, 0], - padding_maskvalue=0, - *, - order=None - ): - super().__init__(order) - if isinstance(output_size, int): - self.output_size = (output_size, output_size) - else: - self.output_size = output_size - self.pad = Pad(padding_size, padding_value, order=self.order) - self.padding_value = padding_value - self.padding_maskvalue = padding_maskvalue - - def apply(self, input): - input = self.pad.apply(input) - self._h, self._w, _ = self._get_image(input).shape - self._th, self._tw = self.output_size - self._x = np.random.randint(0, max(0, self._w - self._tw) + 1) - self._y = np.random.randint(0, max(0, self._h - self._th) + 1) - return super().apply(input) - - def _apply_image(self, image): - if self._th > self._h: - image = F.pad(image, (self._th - self._h, 0), self.padding_value) - if self._tw > self._w: - image = F.pad(image, (0, self._tw - self._w), self.padding_value) - return image[self._y : self._y + self._th, self._x : self._x + self._tw] - - def _apply_coords(self, coords): - coords[:, 0] -= self._x - coords[:, 1] -= self._y - return coords - - def _apply_mask(self, mask): - if self._th > self._h: - mask = F.pad(mask, (self._th - self._h, 0), self.padding_maskvalue) - if self._tw > self._w: - mask = F.pad(mask, (0, self._tw - self._w), self.padding_maskvalue) - return mask[self._y : self._y + self._th, self._x : self._x + self._tw] - - -class RandomResizedCrop(VisionTransform): - r"""Crop the input data to random size and aspect ratio. - A crop of random size (default: of 0.08 to 1.0) of the original size and a random - aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. - After applying crop transfrom, the input data will be resized to given size. - - :param output_size: Target size of output image, with (height, width) shape. - :param scale_range: Range of size of the origin size cropped. Default: (0.08, 1.0) - :param ratio_range: Range of aspect ratio of the origin aspect ratio cropped. 
Default: (0.75, 1.33) - :param order: The same with :class:`VisionTransform` - """ - - def __init__( - self, - output_size, - scale_range=(0.08, 1.0), - ratio_range=(3.0 / 4, 4.0 / 3), - interpolation=cv2.INTER_LINEAR, - *, - order=None - ): - super().__init__(order) - if isinstance(output_size, int): - self.output_size = (output_size, output_size) - else: - self.output_size = output_size - assert ( - scale_range[0] <= scale_range[1] - ), "scale_range should be of kind (min, max)" - assert ( - ratio_range[0] <= ratio_range[1] - ), "ratio_range should be of kind (min, max)" - self.scale_range = scale_range - self.ratio_range = ratio_range - self.interpolation = interpolation - - def apply(self, input: Tuple): - self._coord_info = self._get_coord(self._get_image(input)) - return super().apply(input) - - def _apply_image(self, image): - x, y, w, h = self._coord_info - cropped_img = image[y : y + h, x : x + w] - return F.resize(cropped_img, self.output_size, self.interpolation) - - def _apply_coords(self, coords): - x, y, w, h = self._coord_info - coords[:, 0] = (coords[:, 0] - x) * self.output_size[1] / w - coords[:, 1] = (coords[:, 1] - y) * self.output_size[0] / h - return coords - - def _apply_mask(self, mask): - x, y, w, h = self._coord_info - cropped_mask = mask[y : y + h, x : x + w] - return F.resize(cropped_mask, self.output_size, cv2.INTER_NEAREST) - - def _get_coord(self, image, attempts=10): - height, width, _ = image.shape - area = height * width - - for _ in range(attempts): - target_area = np.random.uniform(*self.scale_range) * area - log_ratio = tuple(math.log(x) for x in self.ratio_range) - aspect_ratio = math.exp(np.random.uniform(*log_ratio)) - - w = int(round(math.sqrt(target_area * aspect_ratio))) - h = int(round(math.sqrt(target_area / aspect_ratio))) - - if 0 < w <= width and 0 < h <= height: - x = np.random.randint(0, width - w + 1) - y = np.random.randint(0, height - h + 1) - return x, y, w, h - - # Fallback to central crop - in_ratio = float(width) / float(height) - if in_ratio < min(self.ratio_range): - w = width - h = int(round(w / min(self.ratio_range))) - elif in_ratio > max(self.ratio_range): - h = height - w = int(round(h * max(self.ratio_range))) - else: # whole image - w = width - h = height - x = (width - w) // 2 - y = (height - h) // 2 - return x, y, w, h - - -class CenterCrop(VisionTransform): - r"""Crops the given the input data at the center. - - :param output_size: Target size of output image, with (height, width) shape. 
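RandomResizedCrop's box sampling, documented above, draws a target area uniformly from scale_range and an aspect ratio log-uniformly from ratio_range, retrying until the box fits the image. A self-contained sketch; the whole-image fallback below simplifies the original's central-crop fallback.

import math

import numpy as np

def sample_crop(height, width, scale_range=(0.08, 1.0),
                ratio_range=(3 / 4, 4 / 3), attempts=10):
    area = height * width
    log_lo, log_hi = (math.log(r) for r in ratio_range)
    for _ in range(attempts):
        target_area = np.random.uniform(*scale_range) * area
        aspect = math.exp(np.random.uniform(log_lo, log_hi))  # log-uniform ratio
        w = int(round(math.sqrt(target_area * aspect)))
        h = int(round(math.sqrt(target_area / aspect)))
        if 0 < w <= width and 0 < h <= height:
            x = np.random.randint(0, width - w + 1)
            y = np.random.randint(0, height - h + 1)
            return x, y, w, h
    # fallback: whole image (the original clamps to the ratio range and center-crops)
    return 0, 0, width, height

print(sample_crop(480, 640))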
- :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, output_size, *, order=None): - super().__init__(order) - if isinstance(output_size, int): - self.output_size = (output_size, output_size) - else: - self.output_size = output_size - - def apply(self, input: Tuple): - self._coord_info = self._get_coord(self._get_image(input)) - return super().apply(input) - - def _apply_image(self, image): - x, y = self._coord_info - th, tw = self.output_size - return image[y : y + th, x : x + tw] - - def _apply_coords(self, coords): - x, y = self._coord_info - coords[:, 0] -= x - coords[:, 1] -= y - return coords - - def _apply_mask(self, mask): - x, y = self._coord_info - th, tw = self.output_size - return mask[y : y + th, x : x + tw] - - def _get_coord(self, image): - th, tw = self.output_size - h, w, _ = image.shape - assert th <= h and tw <= w, "output size is bigger than image size" - x = int(round((w - tw) / 2.0)) - y = int(round((h - th) / 2.0)) - return x, y - - -class RandomHorizontalFlip(VisionTransform): - r"""Horizontally flip the input data randomly with a given probability. - - :param p: probability of the input data being flipped. Default: 0.5 - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, prob: float = 0.5, *, order=None): - super().__init__(order) - self.prob = prob - - def apply(self, input: Tuple): - self._flipped = np.random.random() < self.prob - self._w = self._get_image(input).shape[1] - return super().apply(input) - - def _apply_image(self, image): - if self._flipped: - return F.flip(image, flipCode=1) - return image - - def _apply_coords(self, coords): - if self._flipped: - coords[:, 0] = self._w - coords[:, 0] - return coords - - def _apply_mask(self, mask): - if self._flipped: - return F.flip(mask, flipCode=1) - return mask - - -class RandomVerticalFlip(VisionTransform): - r"""Vertically flip the input data randomly with a given probability. - - :param p: probability of the input data being flipped. Default: 0.5 - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, prob: float = 0.5, *, order=None): - super().__init__(order) - self.prob = prob - - def apply(self, input: Tuple): - self._flipped = np.random.random() < self.prob - self._h = self._get_image(input).shape[0] - return super().apply(input) - - def _apply_image(self, image): - if self._flipped: - return F.flip(image, flipCode=0) - return image - - def _apply_coords(self, coords): - if self._flipped: - coords[:, 1] = self._h - coords[:, 1] - return coords - - def _apply_mask(self, mask): - if self._flipped: - return F.flip(mask, flipCode=0) - return mask - - -class Normalize(VisionTransform): - r"""Normalize the input data with mean and standard deviation. - Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, - this transform will normalize each channel of the input data. - ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` - - :param mean: Sequence of means for each channel. - :param std: Sequence of standard deviations for each channel. 
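Normalize reduces to a single broadcasted expression over an HWC image. A numpy sketch; the per-channel BGR statistics below are illustrative assumptions, not project defaults.

import numpy as np

mean = np.array([103.5, 116.3, 123.7], dtype=np.float32)  # assumed BGR means
std = np.array([57.4, 57.1, 58.4], dtype=np.float32)      # assumed BGR stds

image = np.random.randint(0, 256, (4, 4, 3)).astype(np.float32)
normalized = (image - mean) / std  # broadcasts over the channel axis
print(normalized.shape)  # (4, 4, 3)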
- :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, mean=0.0, std=1.0, *, order=None): - super().__init__(order) - self.mean = np.array(mean, dtype=np.float32) - self.std = np.array(std, dtype=np.float32) - - def _apply_image(self, image): - return (image - self.mean) / self.std - - def _apply_coords(self, coords): - return coords - - def _apply_mask(self, mask): - return mask - - -class GaussianNoise(VisionTransform): - r"""Add random gaussian noise to the input data. - Gaussian noise is generated with given mean and std. - - :param mean: Gaussian mean used to generate noise. - :param std: Gaussian standard deviation used to generate noise. - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, mean=0.0, std=1.0, *, order=None): - super().__init__(order) - self.mean = np.array(mean, dtype=np.float32) - self.std = np.array(std, dtype=np.float32) - - def _apply_image(self, image): - dtype = image.dtype - noise = np.random.normal(self.mean, self.std, image.shape) * 255 - image = image + noise.astype(np.float32) - return np.clip(image, 0, 255).astype(dtype) - - def _apply_coords(self, coords): - return coords - - def _apply_mask(self, mask): - return mask - - -class BrightnessTransform(VisionTransform): - r"""Adjust brightness of the input data. - - :param value: How much to adjust the brightness. Can be any - non negative number. 0 gives the original image - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, value, *, order=None): - super().__init__(order) - if value < 0: - raise ValueError("brightness value should be non-negative") - self.value = value - - def _apply_image(self, image): - if self.value == 0: - return image - - dtype = image.dtype - image = image.astype(np.float32) - alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) - image = image * alpha - return image.clip(0, 255).astype(dtype) - - def _apply_coords(self, coords): - return coords - - def _apply_mask(self, mask): - return mask - - -class ContrastTransform(VisionTransform): - r"""Adjust contrast of the input data. - - :param value: How much to adjust the contrast. Can be any - non negative number. 0 gives the original image - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, value, *, order=None): - super().__init__(order) - if value < 0: - raise ValueError("contrast value should be non-negative") - self.value = value - - def _apply_image(self, image): - if self.value == 0: - return image - - dtype = image.dtype - image = image.astype(np.float32) - alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) - image = image * alpha + F.to_gray(image).mean() * (1 - alpha) - return image.clip(0, 255).astype(dtype) - - def _apply_coords(self, coords): - return coords - - def _apply_mask(self, mask): - return mask - - -class SaturationTransform(VisionTransform): - r"""Adjust saturation of the input data. - - :param value: How much to adjust the saturation. Can be any - non negative number. 
0 gives the original image - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, value, *, order=None): - super().__init__(order) - if value < 0: - raise ValueError("saturation value should be non-negative") - self.value = value - - def _apply_image(self, image): - if self.value == 0: - return image - - dtype = image.dtype - image = image.astype(np.float32) - alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) - image = image * alpha + F.to_gray(image) * (1 - alpha) - return image.clip(0, 255).astype(dtype) - - def _apply_coords(self, coords): - return coords - - def _apply_mask(self, mask): - return mask - - -class HueTransform(VisionTransform): - r"""Adjust hue of the input data. - - :param value: How much to adjust the hue. Can be any number - between 0 and 0.5, 0 gives the original image - :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, value, *, order=None): - super().__init__(order) - if value < 0 or value > 0.5: - raise ValueError("hue value should be in [0.0, 0.5]") - self.value = value - - def _apply_image(self, image): - if self.value == 0: - return image - - dtype = image.dtype - image = image.astype(np.uint8) - hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV_FULL) - h, s, v = cv2.split(hsv_image) - - alpha = np.random.uniform(-self.value, self.value) - h = h.astype(np.uint8) - # uint8 addition take cares of rotation across boundaries - with np.errstate(over="ignore"): - h += np.uint8(alpha * 255) - hsv_image = cv2.merge([h, s, v]) - return cv2.cvtColor(hsv_image, cv2.COLOR_HSV2BGR_FULL).astype(dtype) - - def _apply_coords(self, coords): - return coords - - def _apply_mask(self, mask): - return mask - - -class ColorJitter(VisionTransform): - r"""Randomly change the brightness, contrast, saturation and hue of an image. - - :param brightness: How much to jitter brightness. - Chosen uniformly from [max(0, 1 - brightness), 1 + brightness] - or the given [min, max]. Should be non negative numbers. - :param contrast: How much to jitter contrast. - Chosen uniformly from [max(0, 1 - contrast), 1 + contrast] - or the given [min, max]. Should be non negative numbers. - :param saturation: How much to jitter saturation. - Chosen uniformly from [max(0, 1 - saturation), 1 + saturation] - or the given [min, max]. Should be non negative numbers. - :param hue: How much to jitter hue. - Chosen uniformly from [-hue, hue] or the given [min, max]. - Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. 
- :param order: The same with :class:`VisionTransform` - """ - - def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, *, order=None): - super().__init__(order) - transforms = [] - if brightness != 0: - transforms.append(BrightnessTransform(brightness)) - if contrast != 0: - transforms.append(ContrastTransform(contrast)) - if saturation != 0: - transforms.append(SaturationTransform(saturation)) - if hue != 0: - transforms.append(HueTransform(hue)) - self.transforms = Compose( - transforms, - shuffle_indices=[tuple(range(1, len(transforms) + 1))], - order=order, - ) - - def apply(self, input): - return self.transforms.apply(input) - - -class Lighting(VisionTransform): - def __init__(self, scale, *, order=None): - super().__init__(order) - if scale < 0: - raise ValueError("lighting scale should be non-negative") - self.scale = scale - self.eigvec = np.array( - [ - [-0.5836, -0.6948, 0.4203], - [-0.5808, -0.0045, -0.8140], - [-0.5675, 0.7192, 0.4009], - ] - ) # reverse the first dimension for BGR - self.eigval = np.array([0.2175, 0.0188, 0.0045]) - - def _apply_image(self, image): - if self.scale == 0: - return image - - dtype = image.dtype - image = image.astype(np.float32) - alpha = np.random.normal(scale=self.scale, size=3) - image = image + self.eigvec.dot(alpha * self.eigval) - return image.clip(0, 255).astype(dtype) - - def _apply_coords(self, coords): - return coords - - def _apply_mask(self, mask): - return mask diff --git a/python_module/megengine/distributed/__init__.py b/python_module/megengine/distributed/__init__.py deleted file mode 100644 index 1416e82c..00000000 --- a/python_module/megengine/distributed/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .functional import ( - all_gather, - all_reduce_max, - all_reduce_min, - all_reduce_sum, - all_to_all, - bcast_param, - broadcast, - gather, - reduce_scatter_sum, - reduce_sum, - scatter, -) -from .util import ( - get_backend, - get_free_ports, - get_master_ip, - get_master_port, - get_rank, - get_world_size, - group_barrier, - init_process_group, - is_distributed, - synchronized, -) diff --git a/python_module/megengine/distributed/functional.py b/python_module/megengine/distributed/functional.py deleted file mode 100644 index 56ad089f..00000000 --- a/python_module/megengine/distributed/functional.py +++ /dev/null @@ -1,302 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
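For reference, the transforms deleted above were typically chained with Compose. A hedged usage sketch: the megengine.data.transform import path follows the legacy package layout and is an assumption, as is passing a bare image to apply.

import numpy as np

from megengine.data.transform import (  # assumed legacy namespace
    ColorJitter,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
)

aug = Compose([
    RandomResizedCrop(224),
    RandomHorizontalFlip(prob=0.5),
    ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
    Normalize(mean=[103.5, 116.3, 123.7], std=[57.4, 57.1, 58.4]),
])

image = np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)
out = aug.apply(image)  # apply() follows VisionTransform's input convention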
-from typing import Optional, Union - -import megengine._internal as mgb -from megengine._internal.opr_param_defs import CollectiveComm as Param - -from ..core import Buffer, Parameter, Tensor, wrap_io_tensor -from ..functional import add_update -from .helper import collective_comm_symvar -from .util import get_rank, is_distributed - - -@wrap_io_tensor -def _collective_comm(*args, **kargs): - return collective_comm_symvar(*args, **kargs) - - -def _group_check(*args): - """Return True when arguments are all None or all not None - """ - l = [val is None for val in args] - return len(set(l)) <= 1 - - -def reduce_sum( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - is_root: Optional[bool] = None, -) -> Tensor: - """Create reduce_sum operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param is_root: whether this is a root node - """ - assert _group_check( - key, nr_ranks, is_root - ), "key, nr_ranks, is_root should be set at the same time" - return _collective_comm( - tensor, key, Param.Mode.REDUCE_SUM, nr_ranks, is_root, device=tensor.device, - ) - - -def gather( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - is_root: Optional[bool] = None, - rank: Optional[int] = None, -) -> Tensor: - """Create gather operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param is_root: whether this is a root node - :param rank: rank of this node - """ - assert _group_check( - key, nr_ranks, is_root, rank - ), "key, nr_ranks, is_root, rank should be set at the same time" - return _collective_comm( - tensor, key, Param.Mode.GATHER, nr_ranks, is_root, rank, device=tensor.device, - ) - - -def broadcast( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - is_root: Optional[bool] = None, -) -> Tensor: - """Create broadcast operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param is_root: whether this is a root node - """ - assert _group_check( - key, nr_ranks, is_root - ), "key, nr_ranks, is_root should be set at the same time" - - if is_root is None: - is_root = get_rank() == 0 - if is_root: - inp = tensor - else: - inp = tensor._symvar.owner_graph - - return _collective_comm( - inp, - key, - Param.Mode.BROADCAST, - nr_ranks, - is_root, - dtype=tensor.dtype, - device=tensor.device, - ) - - -def scatter( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - is_root: Optional[bool] = None, - rank: Optional[int] = None, -) -> Tensor: - """Create scatter operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param is_root: whether this is a root node - :param rank: rank of this node - """ - assert _group_check( - key, nr_ranks, is_root, rank - ), "key, nr_ranks, is_root, rank should be set at the same time" - if key is None: - key = tensor._symvar.name - if is_root is None: - is_root = get_rank() == 0 - - if is_root: - inp = tensor - else: - inp = tensor._symvar.owner_graph - - return 
_collective_comm( - inp, - key, - Param.Mode.SCATTER, - nr_ranks, - is_root, - rank, - dtype=tensor.dtype, - device=tensor.device, - ) - - -def all_to_all( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - rank: Optional[int] = None, - local_grad: Optional[bool] = False, -) -> Tensor: - """Create all_to_all operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param rank: rank of this node - :param local_grad: whether use local grad - """ - assert _group_check( - key, nr_ranks, rank - ), "key, nr_ranks, rank should be set at the same time" - return _collective_comm( - tensor, key, Param.Mode.ALL_TO_ALL, nr_ranks, rank=rank, local_grad=local_grad, - ) - - -def all_gather( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - rank: Optional[int] = None, - local_grad: Optional[bool] = False, -) -> Tensor: - """Create all_gather operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param rank: rank of this node - :param local_grad: whether use local grad - """ - assert _group_check( - key, nr_ranks, rank - ), "key, nr_ranks, rank should be set at the same time" - return _collective_comm( - tensor, key, Param.Mode.ALL_GATHER, nr_ranks, rank=rank, local_grad=local_grad - ) - - -def reduce_scatter_sum( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - rank: Optional[int] = None, - local_grad: Optional[bool] = False, -) -> Tensor: - """Create reduce_scatter_sum operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param rank: rank of this node - :param local_grad: whether use local grad - """ - assert _group_check( - key, nr_ranks, rank - ), "key, nr_ranks, rank should be set at the same time" - return _collective_comm( - tensor, - key, - Param.Mode.REDUCE_SCATTER_SUM, - nr_ranks, - rank=rank, - local_grad=local_grad, - ) - - -def all_reduce_sum( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - local_grad: Optional[bool] = False, -) -> Tensor: - """Create all_reduce_sum operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param local_grad: whether use local grad - """ - assert _group_check(key, nr_ranks), "key, nr_ranks should be set at the same time" - return _collective_comm( - tensor, key, Param.Mode.ALL_REDUCE_SUM, nr_ranks, local_grad=local_grad - ) - - -def all_reduce_max( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - local_grad: Optional[bool] = False, -) -> Tensor: - """Create all_reduce_max operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param local_grad: whether use local grad - """ - assert _group_check(key, nr_ranks), "key, nr_ranks should be set at the same time" - return _collective_comm( - tensor, key, Param.Mode.ALL_REDUCE_MAX, nr_ranks, local_grad=local_grad - 
) - - -def all_reduce_min( - tensor: Tensor, - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - local_grad: Optional[bool] = False, -) -> Tensor: - """Create all_reduce_min operator for collective communication - - :param tensor: input tensor - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param local_grad: whether use local grad - """ - assert _group_check(key, nr_ranks), "key, nr_ranks should be set at the same time" - return _collective_comm( - tensor, key, Param.Mode.ALL_REDUCE_MIN, nr_ranks, local_grad=local_grad - ) - - -def bcast_param( - inp: Union[Buffer, Parameter], - key: Optional[str] = None, - nr_ranks: Optional[int] = None, - is_root: Optional[bool] = None, -) -> None: - """Broadcast parameters among devices - - :param inp: input Buffer or Parameter to be synchronized - :param key: unique identifier for collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param is_root: whether this is a root node - """ - if not is_distributed(): - return - assert _group_check( - key, nr_ranks, is_root - ), "key, nr_ranks, is_root should be set at the same time" - assert isinstance(inp, (Buffer, Parameter)) - bcast_res = broadcast(inp, key, nr_ranks, is_root) - add_update(inp, bcast_res, alpha=0) diff --git a/python_module/megengine/distributed/helper.py b/python_module/megengine/distributed/helper.py deleted file mode 100644 index 7d2d84bd..00000000 --- a/python_module/megengine/distributed/helper.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
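All of the collectives above share the all-or-none contract enforced by _group_check: the optional group arguments must either all be None (so process-group defaults apply) or all be given explicitly. The check itself is one line:

def group_check(*args):
    """True when the arguments are all None or all not None."""
    return len({val is None for val in args}) <= 1

assert group_check(None, None, None)       # all defaulted: OK
assert group_check("key", 2, True)         # all explicit: OK
assert not group_check("key", None, True)  # mixed: rejected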
-from typing import Optional, Union - -import megengine._internal as mgb -from megengine._internal.opr_param_defs import CollectiveComm as CollParam - -from .util import ( - get_backend, - get_group_id, - get_master_ip, - get_master_port, - get_rank, - get_world_size, -) - - -def collective_comm_symvar( - inp: Union[mgb.SymbolVar, mgb.CompGraph], - key: Optional[str] = None, - op: CollParam.Mode = None, - nr_ranks: Optional[int] = None, - is_root: Optional[bool] = None, - rank: Optional[int] = None, - local_grad: Optional[bool] = False, - dtype: Optional[type] = None, - device: Optional[mgb.CompNode] = None, - comp_graph: Optional[mgb.CompGraph] = None, -) -> mgb.SymbolVar: - """Helper function for creating collective_comm operators - - :param inp: tensor or comp_graph - :param key: unique identifier for collective communication - :param op: mode of collective communication - :param nr_ranks: number of ranks, use util.get_world_size() as default - :param is_root: whether this node is root node - :param rank: rank of this node - :param local_grad: whether use local grad - :param dtype: output data type, use dtype of inp as default - :param device: output comp node, use comp node of inp as default - :param comp_graph: output comp graph, use comp graph of inp as default - """ - return mgb.opr.collective_comm( - inp, - key=key if key is not None else ("collective_comm_" + str(get_group_id())), - nr_devices=nr_ranks if nr_ranks is not None else get_world_size(), - is_root=is_root if is_root is not None else (get_rank() == 0), - rank=rank if rank is not None else get_rank(), - local_grad=local_grad, - server_addr=get_master_ip(), - port=get_master_port(), - param=CollParam(mode=op), - dtype=dtype, - backend=get_backend(), - comp_node=device, - comp_graph=comp_graph, - ) diff --git a/python_module/megengine/distributed/util.py b/python_module/megengine/distributed/util.py deleted file mode 100644 index 5166a9fc..00000000 --- a/python_module/megengine/distributed/util.py +++ /dev/null @@ -1,146 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import functools -import socket -from typing import Callable, List, Optional - -import megengine._internal as mgb - -from ..core import set_default_device - -_master_ip = None -_master_port = 0 -_world_size = 0 -_rank = 0 -_backend = None -_group_id = 0 - - -def init_process_group( - master_ip: str, - master_port: int, - world_size: int, - rank: int, - dev: int, - backend: Optional[str] = "nccl", -) -> None: - """Initialize the distributed process group, and also specify the device used in the current process. - - :param master_ip: IP address of the master node. - :param master_port: Port available for all processes to communicate. - :param world_size: Total number of processes participating in the job. - :param rank: Rank of the current process. - :param dev: The GPU device id to bind this process to. 
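A hedged sketch of how the legacy API above was driven: one process per GPU, each calling init_process_group with the same master address and its own rank (rank 0 starts the server, per the implementation below). The port and world size are illustrative values.

import multiprocessing as mp

import megengine.distributed as dist  # the legacy package removed in this diff

def worker(rank, world_size, port):
    dist.init_process_group(
        master_ip="localhost",
        master_port=port,
        world_size=world_size,
        rank=rank,
        dev=rank,          # bind process i to GPU i
        backend="nccl",
    )
    # ... build the model, then e.g. dist.bcast_param(...) on each parameter

if __name__ == "__main__":
    world_size, port = 2, 23456  # illustrative
    procs = [mp.Process(target=worker, args=(r, world_size, port))
             for r in range(world_size)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()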
- :param backend: Communicator backend, currently support 'nccl' and 'ucx' - """ - global _master_ip # pylint: disable=global-statement - global _master_port # pylint: disable=global-statement - global _world_size # pylint: disable=global-statement - global _rank # pylint: disable=global-statement - global _backend # pylint: disable=global-statement - global _group_id # pylint: disable=global-statement - - if not isinstance(master_ip, str): - raise TypeError("Expect type str but got {}".format(type(master_ip))) - if not isinstance(master_port, int): - raise TypeError("Expect type int but got {}".format(type(master_port))) - if not isinstance(world_size, int): - raise TypeError("Expect type int but got {}".format(type(world_size))) - if not isinstance(rank, int): - raise TypeError("Expect type int but got {}".format(type(rank))) - if not isinstance(backend, str): - raise TypeError("Expect type str but got {}".format(type(backend))) - - _master_ip = master_ip - _master_port = master_port - _world_size = world_size - _rank = rank - _backend = backend - _group_id = 0 - - set_default_device(mgb.comp_node("gpu" + str(dev))) - - if rank == 0: - _master_port = mgb.config.create_mm_server("0.0.0.0", master_port) - if _master_port == -1: - raise Exception("Failed to start server on port {}".format(master_port)) - else: - assert master_port > 0, "master_port must be specified for non-zero rank" - - -def is_distributed() -> bool: - """Return True if the distributed process group has been initialized""" - return _world_size is not None and _world_size > 1 - - -def get_master_ip() -> str: - """Get the IP address of the master node""" - return str(_master_ip) - - -def get_master_port() -> int: - """Get the port of the rpc server on the master node""" - return _master_port - - -def get_world_size() -> int: - """Get the total number of processes participating in the job""" - return _world_size - - -def get_rank() -> int: - """Get the rank of the current process""" - return _rank - - -def get_backend() -> str: - """Get the backend str""" - return str(_backend) - - -def get_group_id() -> int: - """Get group id for collective communication""" - global _group_id - _group_id += 1 - return _group_id - - -def group_barrier() -> None: - """Block until all ranks in the group reach this barrier""" - mgb.config.group_barrier(_master_ip, _master_port, _world_size, _rank) - - -def synchronized(func: Callable): - """Decorator. Decorated function will synchronize when finished. - Specifically, we use this to prevent data race during hub.load""" - - @functools.wraps(func) - def wrapper(*args, **kwargs): - if not is_distributed(): - return func(*args, **kwargs) - - ret = func(*args, **kwargs) - group_barrier() - return ret - - return wrapper - - -def get_free_ports(num: int) -> List[int]: - """Get one or more free ports. - """ - socks, ports = [], [] - for i in range(num): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(("", 0)) - socks.append(sock) - ports.append(sock.getsockname()[1]) - for sock in socks: - sock.close() - return ports diff --git a/python_module/megengine/functional/__init__.py b/python_module/megengine/functional/__init__.py deleted file mode 100644 index 6220c599..00000000 --- a/python_module/megengine/functional/__init__.py +++ /dev/null @@ -1,118 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# pylint: disable=redefined-builtin -from .elemwise import ( - abs, - add, - arccos, - arcsin, - ceil, - clamp, - cos, - divide, - equal, - exp, - floor, - greater, - greater_equal, - isinf, - isnan, - less, - less_equal, - log, - maximum, - minimum, - mod, - multiply, - power, - relu, - round, - sigmoid, - sin, - subtract, - tanh, -) -from .graph import add_extra_vardep, add_update, grad -from .loss import ( - binary_cross_entropy, - cross_entropy, - cross_entropy_with_softmax, - hinge_loss, - l1_loss, - nll_loss, - smooth_l1_loss, - square_loss, - triplet_margin_loss, -) -from .math import ( - argmax, - argmin, - logsumexp, - max, - mean, - min, - norm, - normalize, - prod, - sqrt, - sum, -) -from .nn import ( - assert_equal, - avg_pool2d, - batch_norm2d, - batched_matrix_mul, - conv2d, - conv_transpose2d, - dropout, - embedding, - eye, - flatten, - identity, - indexing_one_hot, - interpolate, - leaky_relu, - linear, - local_conv2d, - matrix_mul, - max_pool2d, - one_hot, - prelu, - remap, - roi_align, - roi_pooling, - softmax, - softplus, - sync_batch_norm, - warp_perspective, -) -from .quantized import conv_bias_activation -from .sort import argsort, sort, top_k -from .tensor import ( - add_axis, - arange, - broadcast_to, - concat, - cond_take, - dimshuffle, - gather, - linspace, - remove_axis, - reshape, - scatter, - shapeof, - transpose, - where, - zeros_like, -) -from .utils import accuracy, zero_grad - -# delete namespace -# pylint: disable=undefined-variable -del elemwise, graph, loss, math, nn, tensor # type: ignore[name-defined] diff --git a/python_module/megengine/functional/debug_param.py b/python_module/megengine/functional/debug_param.py deleted file mode 100644 index b27f4b4b..00000000 --- a/python_module/megengine/functional/debug_param.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import os - -_conv_execution_strategy = os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY", "HEURISTIC") - - -def get_conv_execution_strategy() -> str: - """Returns the execuation strategy of :class:`~.Conv2d`. - - See :func:`~.set_conv_execution_strategy` for possible return values - """ - return _conv_execution_strategy - - -def set_conv_execution_strategy(option: str): - """Sets the execuation strategy of :class:`~.Conv2d`. - - :param option: Decides how :class:`~.Conv2d` algorithm is chosen. - Available values: - - * 'HEURISTIC' uses heuristic to choose the fastest algorithm. - * 'PROFILE' runs possible algorithms on real device to find the best. - * 'PROFILE_HEURISTIC' uses profile result and heuristic to choose the fastest algorithm. - * 'PROFILE_REPRODUCIBLE' uses the fastest of profile result that is also reproducible. - * 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible. - - The default strategy is 'HEURISTIC'. - - It can also be set through the environmental variable 'MEGENGINE_CONV_EXECUTION_STRATEGY'. 
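The strategy can be chosen either through the environment variable (read once at import time, via the module-level os.getenv above) or through the setter at runtime. A short sketch; the legacy import path is an assumption.

import os

# Option 1: environment variable, set before megengine is imported.
os.environ["MEGENGINE_CONV_EXECUTION_STRATEGY"] = "PROFILE"

# Option 2: the setter, at runtime (legacy namespace assumed).
from megengine.functional.debug_param import set_conv_execution_strategy

set_conv_execution_strategy("PROFILE_HEURISTIC")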
- """ - valid_option = ( - "HEURISTIC", - "PROFILE", - "PROFILE_HEURISTIC", - "PROFILE_REPRODUCIBLE", - "HEURISTIC_REPRODUCIBLE", - ) - if not option in valid_option: - raise ValueError("Valid option can only be one of {}".format(valid_option)) - - global _conv_execution_strategy # pylint: disable=global-statement - _conv_execution_strategy = option diff --git a/python_module/megengine/functional/elemwise.py b/python_module/megengine/functional/elemwise.py deleted file mode 100644 index 16889652..00000000 --- a/python_module/megengine/functional/elemwise.py +++ /dev/null @@ -1,299 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# pylint: disable=unused-argument,invalid-name,redefined-builtin,arguments-out-of-order -import functools - -import megengine._internal as mgb - -from ..core.graph import _use_default_if_none -from ..core.tensor import Tensor, wrap_io_tensor - -__all__ = [ - "abs", - "arccos", - "add", - "arcsin", - "clamp", - "ceil", - "cos", - "divide", - "equal", - "exp", - "greater", - "greater_equal", - "floor", - "isinf", - "isnan", - "less", - "less_equal", - "log", - "maximum", - "minimum", - "mod", - "multiply", - "power", - "relu", - "round", - "sigmoid", - "sin", - "subtract", - "tanh", -] - - -def _elemwise(mode): # DONT export - """Decorator helps to wrap megbrain element-wise oprs""" - - def elemwise_decorator(func): - @functools.wraps(func) - @wrap_io_tensor - def elemwise_func(*inputs) -> Tensor: - if all(isinstance(i, (int, float)) for i in inputs): - device, comp_graph = _use_default_if_none(None, None) - ret = mgb.opr.elemwise( - *inputs, mode=mode, comp_node=device, comp_graph=comp_graph - ) - return ret.inferred_value[0] - return mgb.opr.elemwise(*inputs, mode=mode) - - return elemwise_func - - return elemwise_decorator - - -@_elemwise("ABS") -def abs(x): - """Calculate the absolute value element-wise.""" - - -@_elemwise("ACOS") -def arccos(x): - """Inverse cosine, element-wise.""" - - -@_elemwise("ADD") -def add(x, y): - """Element-wise addition.""" - - -@_elemwise("ASIN") -def arcsin(x): - """Inverse sine, element-wise.""" - - -@_elemwise("CEIL") -def ceil(x): - """Return the ceil of the input, element-wise.""" - - -@_elemwise("COS") -def cos(x): - """Cosine, element-wise.""" - - -@_elemwise("TRUE_DIV") -def divide(x, y): - """Return (x / y) element-wise.""" - - -@_elemwise("EQ") -def equal(x, y): - """Return (x == y) element-wise.""" - - -@_elemwise("EXP") -def exp(x): - """Calculate the exponential element-wise""" - - -@_elemwise("FLOOR") -def floor(x): - """Return the floor of the input, element-wise""" - - -def greater(x, y): - """Return (x > y) element-wise.""" - return less(y, x) - - -def greater_equal(x, y): - """Return (x >= y) element-wise""" - return less_equal(y, x) - - -@_elemwise("LT") -def less(x, y): - """Return (x < y) element-wise.""" - - -@_elemwise("LEQ") -def less_equal(x, y): - """Return (x =< y) element-wise.""" - - -@_elemwise("LOG") -def log(x): - """Natural logarithm (base `e`), element-wise.""" - - -@_elemwise("MAX") -def maximum(x, y): - """Element-wise maximum of array elements.""" - - -@_elemwise("MIN") -def minimum(x, y): - """Element-wise minimum of array elements.""" - - -@_elemwise("MOD") 
-def mod(x, y): - """Return element-wise remainder of division.""" - - -@_elemwise("MUL") -def multiply(x, y): - """Element-wise multiplication.""" - - -@_elemwise("POW") -def power(x, y): - """First tensor elements raised to powers from second tensor (x ** y), element-wise.""" - - -@_elemwise("RELU") -def relu(x): - """Return `max(x, 0)` element-wise.""" - - -@_elemwise("ROUND") -def round(x): - """Round tensor to int element-wise.""" - - -@_elemwise("SIGMOID") -def sigmoid(x): - """Return 1 / ( 1 + exp( -x ) ) element-wise.""" - - -@_elemwise("SIN") -def sin(x): - """Sine, element-wise.""" - - -@_elemwise("SUB") -def subtract(x, y): - """Subtract arguments element-wise""" - - -@_elemwise("TANH") -def tanh(x): - """Compute hyperbolic tangent element-wise.""" - - -@wrap_io_tensor -def clamp(inp: Tensor, lower=None, upper=None) -> Tensor: - r""" - Clamp all elements in :attr:`inp` into the range `[` :attr:`lower`, :attr:`upper` `]` and return - a resulting tensor: - - .. math:: - y_i = \begin{cases} - \text{lower} & \text{if } x_i < \text{lower} \\ - x_i & \text{if } \text{lower} \leq x_i \leq \text{upper} \\ - \text{upper} & \text{if } x_i > \text{upper} - \end{cases} - - :param inp: the input tensor. - :param lower: lower-bound of the range to be clamped to - :param upper: upper-bound of the range to be clamped to - - Example: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - a = tensor(np.arange(5).astype(np.int32)) - - print(F.clamp(a, 2, 4).numpy()) - - print(F.clamp(a, lower=3).numpy()) - - print(F.clamp(a, upper=3).numpy()) - - .. testoutput:: - - [2 2 2 3 4] - [3 3 3 3 4] - [0 1 2 3 3] - - """ - assert ( - lower is not None or upper is not None - ), "At least one of 'lower' or 'upper' must not be None" - if lower is not None: - if upper is not None: - assert lower <= upper, "clamp lower bound is bigger that upper bound" - return minimum(maximum(inp, lower), upper) - else: - return maximum(inp, lower) - else: - return minimum(inp, upper) - - -def isnan(inp: Tensor) -> Tensor: - r"""Returns a new tensor representing if each element is NaN or not. - - :param: inp - :return: a new tensor representing if each element in :attr:`inp` is NaN or not. - - Examples: - - .. testcode:: - - from megengine import tensor - import megengine.functional as F - - x = tensor([1, float("nan"), 0]) - - print(F.isnan(x)) - - .. testoutput:: - - Tensor([0 1 0], dtype=uint8) - - """ - return (inp != inp).astype("uint8") - - -def isinf(inp: Tensor) -> Tensor: - r"""Returns a new tensor representing if each element is Inf or not. - - :param: inp - :return: a new tensor representing if each element in :attr:`inp` is Inf or not. - - Examples: - - .. testcode:: - - from megengine import tensor - import megengine.functional as F - - x = tensor([1, float("inf"), 0]) - - print(F.isinf(x)) - - .. testoutput:: - - Tensor([0 1 0], dtype=uint8) - - """ - return (abs(inp) == float("inf")).astype("uint8") diff --git a/python_module/megengine/functional/external.py b/python_module/megengine/functional/external.py deleted file mode 100644 index 6c93d217..00000000 --- a/python_module/megengine/functional/external.py +++ /dev/null @@ -1,65 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# pylint: disable=too-many-lines
-from typing import List
-
-import megengine._internal as mgb
-
-from ..core import Tensor, wrap_io_tensor
-
-
-@wrap_io_tensor
-def cambricon_subgraph(
-    inputs: List[Tensor], data: bytes, symbol: str, tensor_dim_mutable: bool,
-) -> List[Tensor]:
-    """Load a serialized Cambricon subgraph (i.e. cnrtModel_t) and
-    execute the operations defined in the subgraph.
-
-    :param inputs: List of input tensors of the subgraph.
-    :param data: The serialized subgraph.
-    :param symbol: The name of the function in the subgraph.
-        The function corresponds to a cnmlFusionOp
-        which is added to the cnmlModel_t/cnrtModel_t.
-    :param tensor_dim_mutable: Whether the input tensors' shapes are mutable
-        in cnrtModel_t.
-    """
-    return mgb.opr.cambricon_runtime(
-        data, symbol, tuple(map(lambda x: x._symvar, inputs)), tensor_dim_mutable
-    )
-
-
-@wrap_io_tensor
-def atlas_subgraph(inputs: List[Tensor], data: bytes) -> List[Tensor]:
-    """Load a serialized Atlas subgraph (i.e. om model) and
-    execute the operations defined in the subgraph.
-
-    :param inputs: List of input tensors of the subgraph.
-    :param data: The serialized subgraph.
-    """
-    return mgb.opr.atlas_runtime(tuple(map(lambda x: x._symvar, inputs)), data)
-
-
-@wrap_io_tensor
-def extern_opr_subgraph(
-    inputs, output_shapes: List[tuple], dump_name: str, dump_data: bytes,
-) -> List[Tensor]:
-    """Load a serialized extern opr subgraph and fake-execute the operator.
-
-    :param inputs: Tensor or list of input tensors.
-    :param output_shapes: The output shapes.
-    :param dump_name: The serialized subgraph name.
-    :param dump_data: The serialized subgraph.
-
-    :return: List of tensors
-    """
-    if not isinstance(inputs, list):
-        inputs = [inputs]
-    return mgb.opr.extern_c_opr_placeholder(
-        inputs, output_shapes, dump_name=dump_name, dump_data=dump_data,
-    )
diff --git a/python_module/megengine/functional/graph.py b/python_module/megengine/functional/graph.py
deleted file mode 100644
index 5dbdadb6..00000000
--- a/python_module/megengine/functional/graph.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import collections
-from typing import Iterable, Optional, Union
-
-import megengine._internal as mgb
-
-from ..core.graph import get_default_graph
-from ..core.tensor import Tensor, wrap_io_tensor
-from ..jit import barrier, mark_impure, trace
-
-
-@wrap_io_tensor
-def grad(
-    target: Tensor,
-    wrt: Union[Tensor, Iterable[Tensor]],
-    warn_mid_wrt: bool = True,
-    use_virtual_grad: bool = None,
-    return_zero_for_nodep: bool = True,
-) -> Union[Tensor, Iterable[Optional[Tensor]], None]:
-    r"""Compute the symbolic gradient of ``target`` with respect to ``wrt``.
-
-    ``wrt`` can either be a single tensor or a sequence of tensors.
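A schematic sketch of the symbolic grad helper documented above, under the legacy graph-mode API that this diff removes; tensor setup is simplified and the values are illustrative.

import megengine as mge
import megengine.functional as F

x = mge.Parameter([2.0])  # parameter to differentiate against
y = (x ** 2).sum()        # scalar target
dx = F.grad(y, x)         # symbolic d(y)/d(x), i.e. 2 * x -> [4.]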
- - :param target: ``grad`` target tensor - :param wrt: with respect to which to compute the gradient - :param warn_mid_wrt: whether to give warning if ``wrt`` is not endpoint - :param use_virtual_grad: whether to use virtual ``grad`` opr, so fwd graph can - be optimized before applying ``grad``; if ``None`` is given, then virtual - ``grad`` would be used if ``graph_opt_level >= 2`` - :param return_zero_for_nodep: if ``target`` does not depend on ``wrt``, set to True to return - a zero-valued :class:`~.Tensor` rather than ``None``; can't be set to False when using - virtual ``grad`` opr. - :return: :math:`\partial\text{target} / \partial\text{wrt}` - """ - if not isinstance(wrt, mgb.SymbolVar): - assert isinstance(wrt, collections.Iterable) - wrt = [w._symvar for w in wrt] - - return mgb.grad(target, wrt, warn_mid_wrt, use_virtual_grad, return_zero_for_nodep) - - -_add_update_cache = {} # type: dict - -_dummy = mgb.SharedScalar(0) - - -def add_update( - dest: Tensor, - delta: Tensor, - *, - alpha: Union[Tensor, float, int] = 1.0, - beta: Union[Tensor, float, int] = 1.0, - bias: Union[Tensor, float, int] = 0.0 -): - r"""Inplace modify ``dest`` as follows: - - .. math:: - dest = alpha * dest + beta * delta + bias - - :param dest: input data that will be inplace modified. - :param delta: update value that will be added to ``dest``. - :param alpha: weight ratio of ``dest``. Default: 1.0 - :param beta: weight ratio of ``delta``. Default: 1.0 - :param bias: bias value appended to the result. Default: 0.0 - """ - - if isinstance(beta, Tensor) or isinstance(alpha, Tensor): - delta *= beta - beta = 1.0 - if isinstance(alpha, Tensor): - delta += (alpha - 1.0) * dest - alpha = 1.0 - if isinstance(bias, Tensor): - delta += bias - bias = 0.0 - - comp_graph = dest._comp_graph or get_default_graph() - comp_node = dest._comp_node - - if not isinstance(delta, Tensor): - _delta = mgb.make_immutable( - value=delta, comp_node=comp_node, comp_graph=comp_graph - ) - else: - _delta = delta._attach(comp_graph) - - _dest = dest._attach(comp_graph) - - # use (dest, delta) as the key, so we could not add the same delta to dest in static graph - key = (comp_graph._id(), _dest.id, _delta.id) - if key in _add_update_cache: - _alpha, _beta, _bias, config = _add_update_cache[key] - mgb.mgb._mgb.SharedScalar__set(_alpha, alpha) - mgb.mgb._mgb.SharedScalar__set(_beta, beta) - mgb.mgb._mgb.SharedScalar__set(_bias, bias) - else: - _alpha = mgb.SharedScalar(alpha) - _beta = mgb.SharedScalar(beta) - _bias = mgb.SharedScalar(bias) - config = mgb.helper.gen_config(None, comp_node, None) - _add_update_cache[key] = (_alpha, _beta, _bias, config) - - u = mgb.mgb._Opr.add_update( - _dest, barrier(_delta), _alpha, _beta, _bias, _dummy, config - ) - mark_impure(u) - - if trace._active_instance: - dest._override_symvar_during_trace(trace._active_instance, u) - - return Tensor(u) - - -@wrap_io_tensor -def add_extra_vardep(oup: Tensor, dep: Tensor): - r"""Explicitly set the dependency that tensor ``oup`` depends on tensor ``dep``. - """ - return mgb.config.add_extra_vardep(oup, dep) diff --git a/python_module/megengine/functional/loss.py b/python_module/megengine/functional/loss.py deleted file mode 100644 index 0755825d..00000000 --- a/python_module/megengine/functional/loss.py +++ /dev/null @@ -1,391 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import megengine._internal as mgb - -from ..core.tensor import Tensor -from .elemwise import abs, equal, log, maximum, power, relu -from .nn import assert_equal, indexing_one_hot -from .tensor import where -from .utils import zero_grad - - -def l1_loss(pred: Tensor, label: Tensor) -> Tensor: - r""" - Calculates the mean absolute error (MAE) between - each element in the pred :math:`x` and label :math:`y`. - - The mean absolute error can be described as: - - .. math:: \ell(x,y) = mean\left(L \right) - - where - - .. math:: - - L = \{l_1,\dots,l_N\}, \quad - l_n = \left| x_n - y_n \right|, - - :math:`x` and :math:`y` are tensors of arbitrary shapes with a total - of :math:`N` elements each. :math:`N` is the batch size. - - :param pred: The predicted result from model. - :param label: The ground truth to compare. - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.functional as F - ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32)) - tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32)) - loss = F.l1_loss(ipt,tgt) - print(loss.numpy()) - - Outputs: - - .. testoutput:: - - [2.75] - - """ - - diff = pred - label - return abs(diff).mean() - - -def square_loss(pred: Tensor, label: Tensor) -> Tensor: - r""" - Calculates the mean squared error (squared L2 norm) between - each element in the pred :math:`x` and label :math:`y`. - - The mean squared error can be described as: - - .. math:: \ell(x, y) = mean\left( L \right) - - where - - .. math:: - - L = \{l_1,\dots,l_N\}, \quad - l_n = \left( x_n - y_n \right)^2, - - :math:`x` and :math:`y` are tensors of arbitrary shapes with a total - of :math:`N` elements each. :math:`N` is the batch size. - - :param pred: The predicted result from model. - :param label: The ground truth to compare. - - Shape: - - pred: :math:`(N, *)` where :math:`*` means any number of additional - dimensions - - label: :math:`(N, *)`. Same shape as ``pred`` - - """ - diff = pred - label - return (diff ** 2).mean() - - -def cross_entropy( - inp: Tensor, target: Tensor, axis: int = 1, ignore_index: int = -1 -) -> Tensor: - r""" - Returns the cross entropy loss in a classification problem. - - .. math:: \textrm{CrossEntropy}(x, y) = - \sum_{i} y_i\log(x_i) - - :param inp: The input tensor representing the predicted probability. - :param label: The input tensor representing the classification label. - :param axis: An axis along which cross_entropy will be applied. Default: 1 - :param ignore_index: Specifies a target value that is ignored and does not contribute to the input gradient. Default: -1 - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - data_shape = (1, 2) - label_shape = (1, ) - - pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(data_shape)) - label = tensor(np.ones(label_shape, dtype=np.int32)) - loss = F.cross_entropy(pred, label) - print(loss.numpy()) - - Outputs: - - .. 
testoutput:: - - [0.69] - - """ - n0 = inp.ndim - n1 = target.ndim - assert n0 == n1 + 1, ( - "target ndim must be one less than input ndim; input_ndim={} " - "target_ndim={}".format(n0, n1) - ) - - if ignore_index != -1: - mask = 1 - equal(target, ignore_index) - target = target * mask - loss = -log(indexing_one_hot(inp, target, axis)) * mask - return loss.sum() / maximum(mask.sum(), 1.0) - else: - return -log(indexing_one_hot(inp, target, axis)).mean() - - -def cross_entropy_with_softmax( - pred: Tensor, label: Tensor, axis: int = 1, label_smooth: float = 0 -) -> Tensor: - r""" - Returns loss after applying :func:`~.softmax` + :func:`~.cross_entropy`. - - It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`. - - When using label smoothing, the label distribution is as follows: - - .. math:: y^{LS}_{k}=y_{k}\left(1-\alpha\right)+\alpha/K - - where :math:`y^{LS}` and :math:`y` are new label distribution and origin label distribution respectively. - k is the index of label distribution. :math:`\alpha` is label_smooth and :math:`K` is the number of classes. - - :param pred: The input tensor representing the predicted probability. - :param label: The input tensor representing the classification label. - :param axis: An axis along which softmax will be applied. Default: 1. - :param label_smooth: A label smoothing of parameter that can re-distribute target distribution. Default: 0. - """ - - n0 = pred.ndim - n1 = label.ndim - assert n0 == n1 + 1, ( - "target ndim must be one less than input ndim; input_ndim={} " - "target_ndim={}".format(n0, n1) - ) - - num_classes = pred.shapeof(axis) - - # Denominator of the softmax - offset = zero_grad(pred.max(axis=axis, keepdims=True)) - pred = pred - offset - down = mgb.opr.elem.exp(pred).sum(axis=axis, keepdims=True) - - up = indexing_one_hot(pred, label, axis) - - if label_smooth != 0: - factor = label_smooth / num_classes - up = up * (1 - label_smooth) + pred.sum(axis=axis, keepdims=True) * factor - - return (log(down) - up).mean() - - -def triplet_margin_loss( - anchor: Tensor, positive: Tensor, negative: Tensor, margin: float = 1.0, p: int = 2 -) -> Tensor: - r""" - Creates a criterion that measures the triplet loss given an input tensors. - - .. math:: - - L(a, p, n) = max\left\{d\left(a_{i},p_{i}\right)-d\left(a_{i}, n_{i}\right)+margin, 0\right\},\ - d\left(x_{i},y_{i}\right)=\left\|x_{i}-y_{i}\right\|_{p} - - :param anchor: The input tensor representing the anchor samples. - :param positive: The input tensor representing the positive samples. - :param negative: The input tensor representing the negative samples. - :param margin: Default: 1.0 - :param p: The norm degree for pairwise distance. 
Default: 2.0 - """ - - s0 = anchor.shapeof() - s1 = positive.shapeof() - s2 = negative.shapeof() - assert_equal(s0, s1) - assert_equal(s1, s2) - - n0 = anchor.ndim - n1 = positive.ndim - n2 = negative.ndim - assert n0 == 2 and n1 == 2 and n2 == 2, ( - "anchor ndim, positive ndim, and negative ndim must be 2; " - "anchor_ndim={} positive_ndim={} negative_ndim={}".format(n0, n1, n2) - ) - assert p > 0, "a margin with a value greater than 0; p={}".format(p) - - diff0 = abs(anchor - positive) - diff1 = abs(anchor - negative) - - d1 = power(power(diff0, p).sum(axis=1, keepdims=True), 1 / p) - d2 = power(power(diff1, p).sum(axis=1, keepdims=True), 1 / p) - - loss = maximum(d1 - d2 + margin, 0) - - return loss.mean() - - -def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor: - r"""Function that measures the Binary Cross Entropy between the target and the prediction. - - :param pred: (N,*) where * means, any number of additional dimensions. - :param label: (N,*), same shape as the input. - - """ - s0 = pred.shapeof() - s1 = label.shapeof() - - assert_equal(s0, s1) - - return -1.0 * (label * log(pred) + (1.0 - label) * log(1 - pred)).mean() - - -def nll_loss( - pred: Tensor, label: Tensor, axis: int = 1, ignore_index: int = -1 -) -> Tensor: - r""" - The negative log likelihood loss. - - :param pred: The predicted result from model. - :param label: The ground truth to compare. - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - data_shape = (2, 2) - label_shape = (2, ) - - data = tensor( - np.array([[1, 0.5], [0.3, 1.2]], dtype=np.float32).reshape(data_shape), - ) - label = tensor( - np.ones(label_shape, dtype=np.int32) - ) - pred = F.log(F.softmax(data)) - loss1 = F.nll_loss(pred, label) - loss2 = F.cross_entropy_with_softmax(data, label) - print(loss1.numpy(), loss2.numpy()) - - Outputs: - - .. testoutput:: - - [0.6576154] [0.6576154] - - """ - n0 = pred.ndim - n1 = label.ndim - assert n0 == n1 + 1, ( - "target ndim must be one less than input ndim; input_ndim={} " - "target_ndim={}".format(n0, n1) - ) - - mask = 1.0 - equal(label, ignore_index) - label = label * mask - - loss = indexing_one_hot(pred, label, axis) * mask - - return -1.0 * loss.sum() / maximum(mask.sum(), 1.0) - - -def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor: - r""" - Caculate the hinge loss which is often used in SVMs. - - The hinge loss can be described as: - - .. math:: loss(x, y) = \frac{1}{N}\sum_i\sum_j(max(0, 1 - x_{ij}*y_{ij})) - - :param pred: The input tensor representing the predicted probability, shape is (N, C). - :param label: The input tensor representing the binary classification label, shape is (N, C). - :param norm: Specify the norm to caculate the loss, should be "L1" or "L2". - - Examples: - - .. testcode:: - - from megengine import tensor - import megengine.functional as F - - pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]]) - label = tensor([[1, -1, -1], [-1, 1, 1]]) - - loss = F.hinge_loss(pred, label) - - print(loss.numpy()) - - Outputs: - - .. testoutput:: - - [1.5] - - """ - assert norm in ["L1", "L2"], "norm must be L1 or L2" - # Converts binary labels to -1/1 labels. - loss = relu(1.0 - pred * label) - if norm == "L1": - return loss.sum(axis=1).mean() - else: - return (loss ** 2).sum(axis=1).mean() - - -def smooth_l1_loss(pred: Tensor, label: Tensor) -> Tensor: - r""" - Caculate the smooth l1 loss proposed in `Fast R-CNN paper by Ross Girshick`. 
- - The smooth l1 loss can be described as: - - .. math:: - \text{loss}(x, y) = \frac{1}{n} \sum_{i} l_{i} - - where :math:`l_{i}` is given by: - - .. math:: - l_{i} = - \begin{cases} - 0.5 (x_i - y_i)^2, & \text{if } |x_i - y_i| < 1 \\ - |x_i - y_i| - 0.5, & \text{otherwise } - \end{cases} - - :param pred: The predicted result from model. - :param label: The ground truth to compare. - - Examples: - - .. testcode:: - - from megengine import tensor - import megengine.functional as F - - pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]]) - label = tensor([[0.4, 1.5, 1.2], [0., 0.1, 2.2]]) - - loss = F.smooth_l1_loss(pred, label) - - print(loss.numpy()) - - Outputs: - - .. testoutput:: - - [0.5608334] - """ - diff = abs(pred - label) - l2_loss = 0.5 * (diff ** 2) - l1_loss = diff - 0.5 - mask = diff < 1 - loss = where(mask, l2_loss, l1_loss) - return loss.mean() diff --git a/python_module/megengine/functional/math.py b/python_module/megengine/functional/math.py deleted file mode 100644 index 06f9cebe..00000000 --- a/python_module/megengine/functional/math.py +++ /dev/null @@ -1,333 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import math -import numbers -from typing import Optional, Sequence, Union - -import megengine._internal as mgb - -from ..core import Tensor, wrap_io_tensor -from .elemwise import clamp, exp, isinf, log -from .tensor import remove_axis, where, zeros_like - - -@wrap_io_tensor -def sum(inp: Tensor, axis: Optional[int] = None, keepdims: bool = False) -> Tensor: - r"""Returns the sum of each row of the ``inp`` tensor in the given ``axis``. - - :param inp: The input tensor. - :param axis: The dimension to reduce. If None, all the dimensions will be reduced. - Default: None - :param keepdims: Whether the output tensor has ``axis`` retained or not. - Default: False - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - data = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3)) - out = F.sum(data) - print(out.numpy()) - - .. testoutput:: - - [21] - - """ - return mgb.opr.reduce_(inp, "SUM", axis, keepdims) - - -@wrap_io_tensor -def prod(inp: Tensor, axis: Optional[int] = None, keepdims=False) -> Tensor: - r""" - Returns the element product of input tensor along given *axis*. - - :param inp: The input tensor - :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: ``None`` - :param keepdims: Whether the output tensor has *axis* retained or not. Default: ``False`` - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - data = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3)) - out = F.prod(data) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [720] - - """ - return mgb.opr.reduce_(inp, "PRODUCT", axis, keepdims) - - -@wrap_io_tensor -def mean(inp: Tensor, axis: Optional[int] = None, keepdims: bool = False) -> Tensor: - """Returns the mean value of each row of the ``inp`` tensor in - the given ``axis``. If axis is a list of dimensions, - reduce over all of them. 
- - :param inp: The input tensor - :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None - :param keepdims: Whether the output tensor has ``axis`` retained or not. Default: False - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - data = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3)) - out = F.mean(data) - print(out.numpy()) - - .. testoutput:: - - [3.5] - - """ - return mgb.opr.mean(inp, axis, keepdims) - - -@wrap_io_tensor -def min(inp: Tensor, axis: Optional[int] = None, keepdims: bool = False) -> Tensor: - r""" - Returns the min value of input tensor along given *axis*. - - :param inp: The input tensor - :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None - :param keepdims: Whether the output tensor has *axis* retained or not. Default: False - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - y = F.min(x) - print(y.numpy()) - - Outputs: - - .. testoutput:: - - [1] - - """ - return mgb.opr.reduce_(inp, "MIN", axis, keepdims) - - -@wrap_io_tensor -def max(inp: Tensor, axis: Optional[int] = None, keepdims: bool = False) -> Tensor: - r"""Returns the max value of the input tensor along given *axis*. - - :param inp: The input tensor - :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None - :param keepdims: Whether the output tensor has *axis* retained or not. Default: False - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - y = F.max(x) - print(y.numpy()) - - .. testoutput:: - - [6] - - """ - return mgb.opr.reduce_(inp, "MAX", axis, keepdims) - - -@wrap_io_tensor -def sqrt(inp: Tensor) -> Tensor: - """ - Return a new tensor with the square-root of the elements of ``inp`` - - :param inp: The input tensor - :return: The computed tensor - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.functional as F - - data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - out = F.sqrt(data) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[0. 1. 1.4142] - [1.7321 2. 2.2361 ]] - - """ - - return mgb.opr.sqrt(inp) - - -def norm(inp: Tensor, p: int = 2, axis: Optional[int] = None, keepdims=False): - """Calculate ``p``-norm of input tensor along certain axis. - - :param inp: The input tensor - :param p: power of value ``p`` applied to ``inp``. Default: 2 - :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None - :param keepdims: Whether the output tensor has ``axis`` retained or not. Default: False - :return: The output tensor - - """ - if axis is None: - inp = inp.reshape(-1) - return (inp ** p).sum(axis=axis, keepdims=keepdims) ** (1.0 / p) - - -@wrap_io_tensor -def argmin(inp: Tensor, axis: Optional[int] = None, keepdims: bool = False) -> Tensor: - r"""Returns the indices of the minimum values along an axis - - :param inp: The input tensor - :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None - :param keepdims: Whether the output tensor has *axis* retained or not. 
Default: False - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - y = F.argmin(x) - print(y.numpy()) - - .. testoutput:: - - [0] - - """ - return mgb.opr.argmin(inp, axis, keepdims) - - -@wrap_io_tensor -def argmax(inp: Tensor, axis: Optional[int] = None, keepdims: bool = False) -> Tensor: - r"""Returns the indices of the maximum values along an axis - - :param inp: The input tensor - :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None - :param keepdims: Whether the output tensor has *axis* retained or not. Default: False - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - y = F.argmax(x) - print(y.numpy()) - - .. testoutput:: - - [5] - - """ - return mgb.opr.argmax(inp, axis, keepdims) - - -def normalize( - inp: Tensor, p: int = 2, axis: Optional[int] = None, eps: float = 1e-12 -) -> Tensor: - r"""Perform :math:`L_p` normalization of input tensor along certain axis. - - For a tensor :attr:`inp` of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each - :math:`n_{dim}` -element vector :math:`v` along dimension :attr:`axis` is transformed as: - - .. math:: - v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}. - - :param inp: the input tensor - :param p: power of value ``p`` applied to ``inp``. Default: 2 - :param axis: The dimension to reduce. If None, all the dimensions will be reduced - to calculate the norm. Default: None - :param eps: a small value to avoid division by zero. Default: 1e-12 - :return: the normalized output tensor - - """ - if axis is None: - return inp / clamp(norm(inp, p), lower=eps) - else: - return inp / clamp(norm(inp, p, axis, keepdims=True), lower=eps) - - -def logsumexp(inp: Tensor, axis: Union[int, Sequence[int]], keepdims: bool = False): - r""" - Compute the log of the sum of exponentials of inputs along the given :attr:`axis`. The computation is numerically stabilized. - - .. math:: - - \mathsf{logsumexp}(x_1, \dots, x_n) = \log(\exp(x_1) + \cdots + \exp(x_n)) - - :param inp: The input tensor. - :param axis: Axis over which the sum is taken. It can be a single axis or a list of axes. - :param keepdims: whether to retain :attr:`axis` or not for the output tensor. - - """ - if isinstance(axis, numbers.Integral): - axis = (axis,) - max_value = inp - for dim in axis: - max_value = max_value.max(axis=dim, keepdims=True) - max_value = where( - isinf(max_value).astype("int32"), zeros_like(max_value), max_value - ) - x = exp(inp - max_value) - for dim in axis: - x = x.sum(axis=dim, keepdims=True) - x = max_value + log(x) - if not keepdims: - axis = sorted(axis, reverse=True) - for i in axis: - x = remove_axis(x, axis=i) - return x diff --git a/python_module/megengine/functional/nn.py b/python_module/megengine/functional/nn.py deleted file mode 100644 index 44438d83..00000000 --- a/python_module/megengine/functional/nn.py +++ /dev/null @@ -1,1234 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# pylint: disable=too-many-lines -from typing import Optional, Tuple, Union - -import megengine._internal as mgb -from megengine._internal import CompGraph, CompNode -from megengine._internal.config import add_extra_vardep -from megengine._internal.opr import add_update -from megengine._internal.opr_param_defs import CollectiveComm as CollParam - -from .. import distributed as dist -from ..core import Tensor, wrap_io_tensor -from ..core.graph import _use_default_if_none -from ..distributed.util import get_group_id -from ..jit import barrier, mark_impure -from ..random import uniform -from ..utils.types import _pair, _pair_nonzero -from .debug_param import get_conv_execution_strategy -from .elemwise import exp, log -from .tensor import where -from .utils import _decide_comp_node_and_comp_graph - - -@wrap_io_tensor -def linear(inp: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor: - """Applies a linear transformation to the input. - - Refer to :class:`~.module.linear.Linear` for more information. - - :param inp: the input tensor with shape `(N, in_features)`. - :param weight: the weight with shape `(out_features, in_features)`. - :param bias: the bias with shape `(out_features,)`. - Default: ``None`` - """ - orig_shape = inp.shape - inp = inp.reshape(-1, orig_shape[-1]) - ret = mgb.opr.matrix_mul(inp, weight, transposeB=True) - ret = ret.reshape(orig_shape[:-1], weight.shape[0]) - if bias is not None: - ret += bias.reshape(1, bias.shape[0]) - return ret - - -@wrap_io_tensor -def conv2d( - inp: Tensor, - weight: Tensor, - bias: Optional[Tensor] = None, - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - conv_mode="CROSS_CORRELATION", - compute_mode="DEFAULT", -) -> Tensor: - """2D convolution operation. - - Refer to :class:`~.Conv2d` for more information. - - :param inp: The feature map of the convolution operation - :param weight: The convolution kernel - :param bias: The bias added to the result of convolution (if given) - :param stride: Stride of the 2D convolution operation. Default: 1 - :param padding: Size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: Dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and the shape of weight should be ``(groups, out_channel // groups, - in_channels // groups, height, width)``. - :type conv_mode: string or :class:`mgb.opr_param_defs.Convolution.Mode` - :param conv_mode: Supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default: - 'CROSS_CORRELATION'. - :type compute_mode: string or - :class:`mgb.opr_param_defs.Convolution.ComputeMode` - :param compute_mode: When set to 'DEFAULT', no special requirements will be - placed on the precision of intermediate results. When set to 'FLOAT32', - Float32 would be used for accumulator and intermediate result, but only - effective when input and output are of Float16 dtype. 
- - """ - ph, pw = _pair(padding) - sh, sw = _pair_nonzero(stride) - dh, dw = _pair_nonzero(dilation) - Sparse = mgb.opr_param_defs.Convolution.Sparse - sparse_type = Sparse.DENSE if groups == 1 else Sparse.GROUP - res = mgb.opr.convolution( - inp, - weight, - pad_h=ph, - pad_w=pw, - stride_h=sh, - stride_w=sw, - dilate_h=dh, - dilate_w=dw, - format="NCHW", - strategy=get_conv_execution_strategy(), - mode=conv_mode, - compute_mode=compute_mode, - sparse=sparse_type, - ) - if bias is not None: - res += bias - return res - - -@wrap_io_tensor -def conv_transpose2d( - inp: Tensor, - weight: Tensor, - bias: Optional[Tensor] = None, - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - conv_mode="CROSS_CORRELATION", - compute_mode="DEFAULT", -) -> Tensor: - """2D transposed convolution operation. - - Refer to :class:`~.ConvTranspose2d` for more information. - - :param inp: The feature map of the convolution operation - :param weight: The convolution kernel - :param bias: The bias added to the result of convolution (if given) - :param stride: Stride of the 2D convolution operation. Default: 1 - :param padding: Size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: Dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and the shape of weight should be ``(groups, out_channel // groups, - in_channels // groups, height, width)``. Default: 1 - :type conv_mode: string or :class:`mgb.opr_param_defs.Convolution.Mode` - :param conv_mode: Supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default: - 'CROSS_CORRELATION'. - :type compute_mode: string or - :class:`mgb.opr_param_defs.Convolution.ComputeMode` - :param compute_mode: When set to 'DEFAULT', no special requirements will be - placed on the precision of intermediate results. When set to 'FLOAT32', - Float32 would be used for accumulator and intermediate result, but only - effective when input and output are of Float16 dtype. - - """ - ph, pw = _pair(padding) - sh, sw = _pair_nonzero(stride) - dh, dw = _pair_nonzero(dilation) - Sparse = mgb.opr_param_defs.Convolution.Sparse - sparse_type = Sparse.DENSE if groups == 1 else Sparse.GROUP - res = mgb.opr.deconvolution( - inp, - weight, - pad_h=ph, - pad_w=pw, - stride_h=sh, - stride_w=sw, - dilate_h=dh, - dilate_w=dw, - format="NCHW", - strategy=get_conv_execution_strategy(), - mode=conv_mode, - compute_mode=compute_mode, - sparse=sparse_type, - ) - if bias is not None: - res += bias - return res - - -@wrap_io_tensor -def local_conv2d( - inp: Tensor, - weight: Tensor, - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - conv_mode="CROSS_CORRELATION", -) -> Tensor: - """Applies spatial 2D convolution over an image with untied kernels. - - Refer to :class:`~.LocalConv2d` for more information. 
- """ - ret = mgb.opr.group_local( - inp, - weight, - pad_h=padding[0], - pad_w=padding[1], - stride_h=stride[0], - stride_w=stride[1], - dilate_h=dilation[0], - dilate_w=dilation[1], - format="NCHW", - mode=conv_mode, - ) - return ret - - -@wrap_io_tensor -def max_pool2d( - inp: Tensor, - kernel_size: Union[int, Tuple[int, int]], - stride: Optional[Union[int, Tuple[int, int]]] = None, - padding: Union[int, Tuple[int, int]] = 0, -) -> Tensor: - """Applies a 2D max pooling over an input. - - Refer to :class:`~.MaxPool2d` for more information. - - :param inp: The input tensor. - :param kernel_size: The size of the window. - :param stride: The stride of the window. If not provided, its value is set to ``kernel_size``. - Default: None - :param padding: Implicit zero padding to be added on both sides. Default: 0 - - """ - - kh, kw = _pair_nonzero(kernel_size) - sh, sw = _pair_nonzero(stride or kernel_size) - ph, pw = _pair(padding) - mode = mgb.opr_param_defs.Pooling.Mode.MAX - return mgb.opr.pooling( - inp, - mode=mode, - format="NCHW", - stride_h=sh, - stride_w=sw, - pad_h=ph, - pad_w=pw, - window_h=kh, - window_w=kw, - ) - - -@wrap_io_tensor -def avg_pool2d( - inp: Tensor, - kernel_size: Union[int, Tuple[int, int]], - stride: Optional[Union[int, Tuple[int, int]]] = None, - padding: Union[int, Tuple[int, int]] = 0, -) -> Tensor: - """ Applies a 2D average pooling over an input. - - Refer to :class:`~.AvgPool2d` for more information. - - :param inp: The input tensor. - :param kernel_size: The size of the window. - :param stride: The stride of the window. If not provided, its value is set to ``kernel_size``. - Default: None - :param padding: Implicit zero padding to be added on both sides. Default: 0 - - """ - kh, kw = _pair_nonzero(kernel_size) - sh, sw = _pair_nonzero(stride or kernel_size) - ph, pw = _pair(padding) - mode = mgb.opr_param_defs.Pooling.Mode.AVERAGE - return mgb.opr.pooling( - inp, - mode=mode, - format="NCHW", - stride_h=sh, - stride_w=sw, - pad_h=ph, - pad_w=pw, - window_h=kh, - window_w=kw, - ) - - -@wrap_io_tensor -def prelu(inp: Tensor, weight: Tensor) -> Tensor: - r""" - Applies the element-wise PReLU function. - - Refer to :class:`~.PReLU` for more information. - """ - - return mgb.opr.elemwise(inp, 0, mode="MAX") + weight * mgb.opr.elemwise( - inp, 0, mode="MIN" - ) - - -@wrap_io_tensor -def leaky_relu(inp: Tensor, negative_slope: float = 0.01) -> Tensor: - r""" - Applies the element-wise leaky_relu function - - Refer to :class:`~.LeakyReLU` for more information. - """ - - return mgb.opr.elemwise(inp, 0, mode="MAX") + negative_slope * mgb.opr.elemwise( - inp, 0, mode="MIN" - ) - - -@wrap_io_tensor -def softplus(inp: Tensor, beta: float = 1, threshold: float = 20) -> Tensor: - r""" - Performs the elementwise function: - - .. math:: - - \mathsf{softplus}(x) = \log(1+\exp(\beta x)) / \beta. - - For numerical stability the identity function is used when :math:`\beta x > \textrm{threshold}`. - - """ - mask = beta * inp <= threshold - out = log(1 + exp(beta * inp)) / beta - out = where(mask, out, inp) - return out - - -@wrap_io_tensor -def flatten(inp: Tensor, start_axis: int = 0, end_axis: int = -1) -> Tensor: - r""" - Reshapes the tensor by flattening the sub-tensor from dimension ``start_axis`` to dimension ``end_axis``. - - :param inp: The input tensor. - :param start_axis: The start dimension that the sub-tensor to be flattened. Default: 0 - :param end_axis: The end dimension that the sub-tensor to be flattened. Default: -1 - - Examples: - - .. 
testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - inp_shape = (2, 2, 3, 3) - inp = tensor( - np.arange(36, dtype=np.int32).reshape(inp_shape), - ) - oup = F.flatten(inp, 2) - print(inp.numpy().shape) - print(oup.numpy().shape) - - Outputs: - - .. testoutput:: - - (2, 2, 3, 3) - (2, 2, 9) - - """ - - target_shape = tuple(inp.shape[i] for i in range(start_axis)) + (-1,) - if end_axis != -1: - target_shape += (inp.shape[end_axis + 1 :],) - return inp.reshape(*target_shape) - - -def _get_softmax_axis(ndim: int) -> int: - if ndim in (0, 1, 3): - return 0 - return 1 - - -@wrap_io_tensor -def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor: - r""" - Applies a softmax function. Softmax is defined as: - - .. math:: - \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} - - It is applied to all elements along axis, and will re-scale them so that - the elements lie in the range `[0, 1]` and sum to 1. - - See :class:`~megengine.module.activation.Softmax` for more details. - - :param inp: The input tensor. - :param axis: An axis along which softmax will be applied. By default, - softmax will apply along the highest ranked axis. - - """ - if axis is None: - axis = _get_softmax_axis(len(inp.imm_shape)) - offset = mgb.opr.zero_grad(inp.max(axis=axis, keepdims=True)) - inp = inp - offset - down = mgb.opr.elem.exp(inp).sum(axis=axis, keepdims=True) - return mgb.opr.elem.exp(inp) / down - - -@wrap_io_tensor -def batch_norm2d( - inp: Tensor, - running_mean: Tensor, - running_var: Tensor, - weight: Optional[Tensor] = None, - bias: Optional[Tensor] = None, - training: bool = False, - momentum: float = 0.9, - eps: float = 1e-5, -) -> Tensor: - """Applies batch normalization to the input. - - Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information. - - :param inp: input tensor. - :param running_mean: tensor to store running mean. - :param running_var: tensor to store running variance. - :param weight: scaling tensor in the learnable affine parameters. - See :math:`\gamma` in :class:`~.BatchNorm2d` - :param bias: bias tensor in the learnable affine parameters. - See :math:`\beta` in :class:`~.BatchNorm2d` - :param training: a boolean value to indicate whether batch norm is performed - in traning mode. Default: ``False`` - :param momentum: the value used for the ``running_mean`` and ``running_var`` - computation. - Default: 0.9 - :param eps: a value added to the denominator for numerical stability. - Default: 1e-5. 
- - """ - - inp = mgb.opr.mark_no_broadcast_elemwise(inp) - _channels = inp.imm_shape[1] - _ndim = len(inp.imm_shape) - _param_shape = (1, _channels) + (1,) * (_ndim - 2) - - assert _ndim == 4, "only 4D tensor supported" - - if weight is not None: - weight = weight.reshape(*_param_shape) - else: - weight = mgb.make_immutable(*_use_default_if_none(None, None), 1.0).broadcast( - *_param_shape - ) - - if bias is not None: - bias = bias.reshape(*_param_shape) - else: - bias = mgb.make_immutable(*_use_default_if_none(None, None), 0.0).broadcast( - *_param_shape - ) - - FwdMode = mgb.opr_param_defs.BN.FwdMode - fwdmode = FwdMode.TRAINING if training else FwdMode.INFERENCE - avg_factor = 1 - momentum - - if running_mean is not None and running_var is not None: - if training: - inp = barrier(inp) - - output = mgb.opr.batch_norm( - inp, - weight, - bias, - running_mean, - running_var, - param_dim="DIM_1C11", - fwd_mode=fwdmode, - epsilon=eps, - avg_factor=avg_factor, - )[-1] - if training: - mark_impure(output) - else: - output = mgb.opr.batch_norm_no_statistic( - inp, - weight, - bias, - param_dim="DIM_1C11", - fwd_mode=fwdmode, - epsilon=eps, - avg_factor=avg_factor, - )[-1] - - return output - - -@wrap_io_tensor -def sync_batch_norm( - input: Tensor, - running_mean: Tensor, - running_var: Tensor, - weight: Optional[Tensor] = None, - bias: Optional[Tensor] = None, - training: bool = False, - momentum: Union[float, Tensor] = 0.9, - eps: float = 1e-5, - eps_mode="ADDITIVE", -) -> Tensor: - """ Applies synchronized batch normalization to the input. - - Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information. - - :param inp: input tensor. - :param running_mean: tensor to store running mean. - :param running_var: tensor to store running variance. - :param weight: scaling tensor in the learnable affine parameters. - See :math:`\gamma` in :class:`~.BatchNorm2d` - :param bias: bias tensor in the learnable affine parameters. - See :math:`\beta` in :class:`~.BatchNorm2d` - :param training: a boolean value to indicate whether batch norm is performed - in traning mode. Default: ``False`` - :param momentum: the value used for the ``running_mean`` and ``running_var`` - computation. - Default: 0.9 - :param eps: a value added to the denominator for numerical stability. - Default: 1e-5. 
- """ - - assert eps_mode in {"MAX", "ADDITIVE"}, "unknown eps_mode: {}".format(eps_mode) - input = mgb.opr.mark_no_broadcast_elemwise(input) - _channels = input.imm_shape[1] - _ndim = len(input.imm_shape) - _param_shape = (1, _channels) + (1,) * (_ndim - 2) - - if training: - - def _sum_on_channel(input): - return mgb.opr.reduce_general([input, _param_shape], mode="sum") - - def _allreduce(stat, key): - return dist.helper.collective_comm_symvar( - stat, key, CollParam.Mode.ALL_REDUCE_SUM - ) - - reduce_size = input.shape[0] - for i in range(2, _ndim): - reduce_size = reduce_size * input.shape[i] - channel_x1s = _sum_on_channel(input) - channel_x2s = _sum_on_channel(input ** 2) - - if dist.is_distributed(): - # reduce all nodes' data to calculate mean and variance - reduce_size = reduce_size.reshape(*(1,) * _ndim) - stat = mgb.opr.concat([reduce_size, channel_x1s, channel_x2s], axis=1) - stat = _allreduce(stat, key="sync_bn_" + str(get_group_id())) - reduce_size = stat[:, :1].reshape(1) - channel_x1s = stat[:, 1 : 1 + _channels] - channel_x2s = stat[:, 1 + _channels :] - - channel_mean = channel_x1s / reduce_size - channel_variance = ( - channel_x1s ** 2 / (-reduce_size * reduce_size) + channel_x2s / reduce_size - ) - else: - assert running_var is not None and running_mean is not None - channel_variance = running_var.reshape(*_param_shape) - channel_mean = running_mean.reshape(*_param_shape) - - invsqrt_channel_variance = ( - mgb.opr.elem.max(channel_variance, eps) - if eps_mode == "MAX" - else mgb.opr.elem.add(channel_variance, eps) - ) ** -0.5 - - if weight is not None: - weight = weight.reshape(*_param_shape) - if bias is not None: - bias = bias.reshape(*_param_shape) - - # outvar = output * weight + bias - # where output = input * invsqrt_channel_variance + ( - # -channel_mean * invsqrt_channel_variance - # ) - # Manually expand output for gopt - - if weight is not None: - inv_var_wt = invsqrt_channel_variance * weight - neg_channel_mean = -channel_mean - if bias is not None: - outvar = input * inv_var_wt + (neg_channel_mean * inv_var_wt + bias) - else: - outvar = input * inv_var_wt + neg_channel_mean * inv_var_wt - else: - outvar = input * invsqrt_channel_variance + ( - -channel_mean * invsqrt_channel_variance - ) - if bias is not None: - outvar = outvar + bias - - if training and running_var is not None and running_mean is not None: - _mean_update = add_update( - running_mean, channel_mean, alpha=momentum, beta=1 - momentum, - ) - channel_variance_unbiased = channel_x1s ** 2 / ( - -reduce_size * (reduce_size - 1) - ) + channel_x2s / (reduce_size - 1) - _variance_update = add_update( - running_var, channel_variance_unbiased, alpha=momentum, beta=1 - momentum - ) - for dep in (_mean_update, _variance_update): - add_extra_vardep(outvar, dep) - - return outvar - - -def one_hot(inp: Tensor, num_classes: int) -> Tensor: - r""" - Perform one-hot encoding for the input tensor. - - :param inp: input tensor - :param num_classes: number of classes denotes the last dimension of the output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - inp = tensor(np.arange(1, 4, dtype=np.int32)) - out = F.one_hot(inp, num_classes=4) - print(out.numpy()) - - Outputs: - - .. 
testoutput:: - - [[0 1 0 0] - [0 0 1 0] - [0 0 0 1]] - - """ - comp_node, comp_graph = _decide_comp_node_and_comp_graph(inp) - - zeros = mgb.make_immutable(value=0, comp_node=comp_node, comp_graph=comp_graph) - zeros_symvar = zeros.broadcast(inp.shapeof(), num_classes) - - ones = mgb.make_immutable(value=1, comp_node=comp_node, comp_graph=comp_graph) - ones_symvar = ones.broadcast(inp.shapeof(), 1) - - return Tensor( - mgb.opr.indexing_set_one_hot( - zeros_symvar, axis=len(inp.shapeof()), index=inp, value=ones_symvar - ) - ) - - -@wrap_io_tensor -def warp_perspective( - inp: Tensor, - M: Tensor, - dsize: Union[Tuple[int, int], int, Tensor], - border_mode: str = "REPLICATE", - border_val: float = 0.0, - interp_mode: str = "LINEAR", -): - r""" - Applies perspective transformation to batched 2D images. - - The input images are transformed to the output images by the transformation matrix: - - .. math:: - \text{output}(n, c, h, w) = \text{input} \left( n, c, - \frac{M_{00}h + M_{01}w + M_{02}}{M_{20}h + M_{21}w + M_{22}}, - \frac{M_{10}h + M_{11}w + M_{12}}{M_{20}h + M_{21}w + M_{22}} - \right) - - :param inp: input image - :param M: (batch, 3, 3) transformation matrix - :param dsize: (h, w) size of the output image - :param border_mode: pixel extrapolation method. Default: ``"REPLICATE"`` - :param border_val: value used in case of a constant border. Default: ``0`` - :param interp_mode: interpolation methods. Default: ``"LINEAR"`` - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - inp_shape = (1, 1, 4, 4) - inp = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape)) - M_shape = (1, 3, 3) - # M defines a translation: dst(1, 1, h, w) = rst(1, 1, h+1, w+1) - M = tensor(np.array([[1., 0., 1.], - [0., 1., 1.], - [0., 0., 1.]], dtype=np.float32).reshape(M_shape)) - out = F.warp_perspective(inp, M, (2, 2)) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[[[ 5. 6.] - [ 9. 10.]]]] - - """ - - return mgb.opr.warp_perspective( - inp, - M, - dsize, - bmode=border_mode, - border_val=border_val, - imode=interp_mode, - format="NCHW", - ) - - -@wrap_io_tensor -def remap( - inp: Tensor, - map_xy: Tensor, - border_mode: str = "REPLICATE", - scalar: float = 0.0, - interp_mode: str = "LINEAR", -) -> Tensor: - r""" - Applies remap transformation to batched 2D images. - - The input images are transformed to the output images by the tensor map_xy. - The output's H and W are same as map_xy's H and W. - - :param inp: input image - :param map_xy: (batch, oh, ow, 2) transformation matrix - :param border_mode: pixel extrapolation method. Default: ``"REPLICATE"`` - :param scalar: value used in case of a constant border. Default: ``0`` - :param interp_mode: interpolation methods. Default: ``"LINEAR"`` - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - inp_shape = (1, 1, 4, 4) - inp = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape)) - map_xy_shape = (1, 2, 2, 2) - map_xy = tensor(np.array([[[1., 0.],[0., 1.]], - [[0., 1.],[0., 1.]]], - dtype=np.float32).reshape(map_xy_shape)) - out = F.remap(inp, map_xy) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[[[1. 4.] - [4. 
4.]]]] - - """ - - return mgb.opr.remap( - inp, - map_xy, - border_type=border_mode, - scalar=scalar, - imode=interp_mode, - format="NCHW", - ) - - -@wrap_io_tensor -def eye( - n: int, - m: Optional[int] = None, - *, - dtype=None, - device: Optional[CompNode] = None, - comp_graph: Optional[CompGraph] = None -) -> Tensor: - """ - Returns a 2D tensor with ones on the diagonal and zeros elsewhere. - - :param n: The number of rows - :param m: The number of columns. Default: None - :param dtype: The data type. Default: None - :param device: Compute node of the matrix. Default: None - :param comp_graph: Compute graph of the matrix. Default: None - :return: The eye matrix - - Examples: - - .. testcode:: - - import numpy as np - import megengine.functional as F - - data_shape = (4, 6) - n, m = data_shape - out = F.eye(n, m, dtype=np.float32) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[1. 0. 0. 0. 0. 0.] - [0. 1. 0. 0. 0. 0.] - [0. 0. 1. 0. 0. 0.] - [0. 0. 0. 1. 0. 0.]] - - """ - - device, comp_graph = _use_default_if_none(device, comp_graph) - if m is None: - m = n - return mgb.opr.eye((n, m), dtype=dtype, comp_node=device, comp_graph=comp_graph) - - -@wrap_io_tensor -def matrix_mul(inp1: Tensor, inp2: Tensor) -> Tensor: - """ - Performs a matrix multiplication of the matrices ``inp1`` and ``inp2`` - - :param inp1: The first matrix to be multiplied (a, b) - :param inp2: The second matrix to be multiplied (b, c) - :return: The output tensor (a, c) - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - shape_1 = (2, 3) - shape_2 = (3, 4) - data1 = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - data2 = tensor(np.arange(0, 6, dtype=np.float32).reshape(3, 2)) - out = F.matrix_mul(data1, data2) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[10. 13.] - [28. 40.]] - - """ - return mgb.opr.matrix_mul(inp1, inp2) - - -@wrap_io_tensor -def batched_matrix_mul(inp1: Tensor, inp2: Tensor) -> Tensor: - """ - Performs a batched multiplication of th batched matrices ``inp1`` and ``inp2`` - - :param inp1: The first batch matrix to be multiplied (n, a, b) - :param inp2: The second batch matrix to be multiplied (n, b, c) - :return: The output batch (n, a, c) - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - batch_size = 3 - shape_1 = (batch_size, 2, 3) - shape_2 = (batch_size, 3, 4) - data1 = tensor( - np.arange(0, batch_size * 6, dtype=np.float32).reshape(batch_size, 2, 3)) - data2 = tensor( - np.arange(0, batch_size * 12, dtype=np.float32).reshape(batch_size, 3, 4)) - out = F.batched_matrix_mul(data1, data2) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[[ 20. 23. 26. 29.] - [ 56. 68. 80. 92.]] - - [[ 344. 365. 386. 407.] - [ 488. 518. 548. 578.]] - - [[1100. 1139. 1178. 1217.] - [1352. 1400. 1448. 1496.]]] - - """ - return mgb.opr.batched_matrix_mul(inp1, inp2) - - -@wrap_io_tensor -def interpolate( - inp: Tensor, - size: Optional[Union[int, Tuple[int, int]]] = None, - scale_factor: Optional[Union[float, Tuple[float, float]]] = None, - mode: str = "BILINEAR", - align_corners: bool = None, -) -> Tensor: - r""" - Down/up samples the input tensor to either the given :attr:`size` or the given - :attr:`scale_factor` - - :param inp: input tensor - :param size: size of the output tensor. Default: ``None`` - :param scale_factor: scaling factor of the output tensor. 
Default: ``None`` - :param mode: interpolation methods, acceptable values are: - 'BILINEAR', 'LINEAR'. Default: ``BILINEAR`` - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - from megengine.test import assertTensorClose - - inp = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) - out = F.interpolate(inp, [4, 4], align_corners=False) - print(out.numpy()) - - out2 = F.interpolate(inp, scale_factor=2.) - assertTensorClose(out.numpy(), out2.numpy()) - - Outputs: - - .. testoutput:: - - [[[[1. 1.25 1.75 2. ] - [1.5 1.75 2.25 2.5 ] - [2.5 2.75 3.25 3.5 ] - [3. 3.25 3.75 4. ]]]] - - """ - mode = mode.upper() - if mode not in ["BILINEAR", "LINEAR"]: - raise ValueError("interpolate only support linear or bilinear mode") - if mode not in ["BILINEAR", "LINEAR"]: - if align_corners is not None: - raise ValueError( - "align_corners option can only be set in the bilinear/linear interpolating mode" - ) - else: - if align_corners is None: - align_corners = False - - if mode == "LINEAR": - inp = mgb.opr.add_axis(inp, 3) - - if len(inp.imm_shape) != 4: - raise ValueError("shape of input tensor must correspond to the operartion mode") - - if size is None: - if scale_factor is None: - raise ValueError("scale_factor must not be None when size is None") - - if isinstance(scale_factor, (float, int)): - scale_factor = float(scale_factor) - if mode == "LINEAR": - scale_factor = (scale_factor, float(1)) - else: - scale_factor = (scale_factor, scale_factor) - else: - if mode == "LINEAR": - raise ValueError( - "under LINEAR mode, scale_factor can only be single value" - ) - - assert len(scale_factor) == 2, "shape of scale_factor must be equal to (2, )" - assert isinstance(scale_factor[0], float) and isinstance( - scale_factor[1], float - ), "scale_factor must be float type" - dsize = tuple( - mgb.opr.elemwise(inp.shape[i + 2] * scale_factor[i], mode="FLOOR") - for i in range(2) - ) - dsize = mgb.opr.concat([dsize[0], dsize[1]], axis=0) - else: - if scale_factor is not None: - raise ValueError("scale_factor must be None when size is provided") - - if isinstance(size, int): - size = (size, 1) - else: - if mode == "LINEAR": - raise ValueError("under LINEAR mode, size can only be single value") - dsize = size - - oh, ow = dsize[0], dsize[1] - ih, iw = inp.shape[2], inp.shape[3] - - if align_corners: - hscale = (ih - 1.0) / (oh - 1.0) - wscale = 1.0 * iw / ow - if mode != "LINEAR": - wscale = (iw - 1.0) / (ow - 1.0) - row0 = mgb.opr.concat([wscale, [0, 0]], axis=0).reshape(1, 3) - row1 = mgb.opr.concat([[0], hscale, [0]], axis=0).reshape(1, 3) - weight = mgb.opr.concat([row0, row1, [[0, 0, 1]]], axis=0).reshape(1, 3, 3) - weight = mgb.opr.broadcast(weight, (inp.shape[0], 3, 3)) - else: - hscale = 1.0 * ih / oh - wscale = 1.0 * iw / ow - row0 = mgb.opr.concat([wscale, [0], 0.5 * wscale - 0.5], axis=0).reshape(1, 3) - row1 = mgb.opr.concat([[0], hscale, 0.5 * hscale - 0.5], axis=0).reshape(1, 3) - weight = mgb.opr.concat([row0, row1, [[0, 0, 1]]], axis=0).reshape(1, 3, 3) - weight = mgb.opr.broadcast(weight, (inp.shape[0], 3, 3)) - - ret = mgb.opr.warp_perspective(inp, weight, dsize, imode="LINEAR", format="NCHW") - if mode == "LINEAR": - ret = mgb.opr.reshape(ret, ret.shape[0:3]) - return ret - - -@wrap_io_tensor -def dropout(inp: Tensor, drop_prob: float, rescale: bool = True) -> Tensor: - """ - Returns a new tensor where each of the elements are randomly set to zero - with probability P = ``drop_prob``. 
Optionally rescale the output tensor. - - :param inp: The input tensor - :param drop_prob: The probability to drop (set to zero) a single element - :param rescale: The default behavior of ``dropout`` during training is to rescale the output, - then it can be replaced by an :class:`~.Identity` during inference, default to True. - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - - import megengine.functional as F - from megengine import tensor - - data = tensor(np.ones(10, dtype=np.float32)) - out = F.dropout(data, 1./3.) - print(out.numpy()) - - Outputs: - - .. testoutput:: - :options: +SKIP - - [1.5 1.5 0. 1.5 1.5 1.5 1.5 1.5 1.5 1.5] - - """ - assert 0 <= drop_prob < 1 - rv = uniform(inp.shape) - mask = rv > drop_prob - inp *= mask.astype(inp.dtype) - if rescale: - inp *= 1 / (1 - drop_prob) - return inp - - -@wrap_io_tensor -def identity(inp: Tensor) -> Tensor: - """applies an identity transform to the input tensor. - - :param inp: The input tensor - """ - return mgb.opr.identity(inp) - - -@wrap_io_tensor -def embedding( - input: Tensor, - weight: Tensor, - padding_idx: Optional[int] = None, - max_norm: Optional[float] = None, - norm_type: Optional[float] = None, -): - """ - Applies lookup table for embedding. - - :param input: the tensor with indices. - :param weight: the learnable weights which embedding from. - :param padding_idx: should be set to None, not support now. - :param max_norm: should be set to None, not support now. - :param norm_type: should be set to None, not support now. - - - Refer to :class:`~.Embedding` for more information. - """ - if padding_idx is not None: - raise ValueError("Not support padding_idx Now!") - if max_norm is not None or norm_type is not None: - raise ValueError("Not support weight normlization Now!") - - return mgb.opr.advanced_indexing(weight)[input.reshape(-1), :].reshape( - input.shape, weight.shape[-1] - ) - - -@wrap_io_tensor -def roi_pooling( - input: Tensor, - rois: Tensor, - output_shape: Union[int, tuple, list], - mode: str = "max", - scale: float = 1.0, -) -> Tensor: - """ - Apply roi pooling on input feature - - :param input: tensor that represents the input feature, (N, C, H, W) images - :param rois: (K, 5) boxes. First column is the index into N. The other 4 columns are xyxy - :param output_shape: (height, width) of output rois feature - :param mode: "max" or "average", use max/average align just like max/average pooling. Default: ``"max"`` - :param scale: scale the input boxes by this number. Default: 1.0 - :return: (K, C, output_shape[0], output_shape[1]) feature of rois - """ - assert mode in ["max", "average"], "only max/average mode is supported" - if isinstance(output_shape, int): - output_shape = (output_shape, output_shape) - - return mgb.opr.roi_pooling( - input, rois, output_shape, mode=mode.upper(), scale=scale - ) - - -@wrap_io_tensor -def roi_align( - input: Tensor, - rois: Tensor, - output_shape: Union[int, tuple, list], - mode: str = "average", - spatial_scale: float = 1.0, - sample_points: Union[int, tuple, list] = 2, - aligned: bool = True, -) -> Tensor: - """ - Apply roi align on input feature - - :param input: tensor that represents the input feature, (N, C, H, W) images - :param rois: (N, 5) boxes. First column is the index into N. The other 4 columns are xyxy - :param output_shape: (height, width) shape of output rois feature. - :param mode: "max" or "average", use max/average align just like max/average pooling. 
Default: ``"average"`` - :param spatial_scale: scale the input boxes by this number. Default: 1.0 - :param sample_points: number of inputs samples to take for each output sample. - 0 to take samples densely. Default: 2 - :param aligned: wheather align the input feature, with `aligned=True`, - we first appropriately scale the ROI and then shift it by -0.5. Default: True - """ - assert mode in ["max", "average"], "only max/average mode is supported" - if isinstance(output_shape, int): - output_shape = (output_shape, output_shape) - pooled_height, pooled_width = output_shape - if isinstance(sample_points, int): - sample_points = (sample_points, sample_points) - sample_height, sample_width = sample_points - offset = 0.5 if aligned else 0.0 - - return mgb.opr.roi_align( - input, - rois, - mode=mode.upper(), - spatial_scale=spatial_scale, - offset=offset, - pooled_height=pooled_height, - pooled_width=pooled_width, - sample_height=sample_height, - sample_width=sample_width, - ) - - -@wrap_io_tensor -def assert_equal( - get: Tensor, expect: Tensor, max_err: float = 1e-4, verbose: bool = False -) -> Tensor: - r""" - Asserts that ``get`` equals to ``expect``, and returns value of ``expect``. - - :param get: tensor to be checked. - :param expect: tensor with expected values. - :param max_err: tolerance that two float values are asserted equal. Default: 1e-4 - :param verbose: whether to print details if two tensors are not equal. Default: False - - Examples: - - .. testcode:: - - import megengine.functional as F - from megengine import tensor - - get = tensor([1.0, 2.0]) - max_err = 0.1 - expect = get + max_err / 2.0 - val = F.assert_equal(expect, get, max_err=max_err) - print(val.numpy()) - - Outputs: - - .. testoutput:: - - [1.05 2.05] - - """ - - return mgb.opr.assert_equal(get, expect, maxerr=max_err, verbose=verbose) - - -@wrap_io_tensor -def indexing_one_hot( - src: Tensor, index: Tensor, axis: int = 1, keepdims=False -) -> Tensor: - r""" - One-hot indexing for some axis. - - :param src: input data tensor. - :param index: index tensor. - :param axis: the axis on src for which values in index index. Default: 1 - :param keepdims: whether not to remove the axis in result. Default: ``False`` - - Examples: - - .. testcode:: - - import megengine.functional as F - from megengine import tensor - - src = tensor([[1.0, 2.0]]) - index = tensor([0]) - val = F.indexing_one_hot(src, index) - print(val.numpy()) - - .. testoutput:: - - [1.] - - """ - - return mgb.opr.indexing_one_hot(src, axis, index, keepdims=keepdims) diff --git a/python_module/megengine/functional/quantized.py b/python_module/megengine/functional/quantized.py deleted file mode 100644 index 8cfb4a5b..00000000 --- a/python_module/megengine/functional/quantized.py +++ /dev/null @@ -1,80 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# pylint: disable=too-many-lines -from typing import Tuple, Union - -from .. 
import _internal as mgb -from ..core import Tensor, wrap_io_tensor -from ..utils.types import _pair, _pair_nonzero -from .debug_param import get_conv_execution_strategy - - -@wrap_io_tensor -def conv_bias_activation( - inp: Tensor, - weight: Tensor, - bias: Tensor, - dtype=None, - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - nonlinear_mode="IDENTITY", - conv_mode="CROSS_CORRELATION", - compute_mode="DEFAULT", -) -> Tensor: - """ convolution bias with activation operation, only for inference. - - :param inp: The feature map of the convolution operation - :param weight: The convolution kernel - :param bias: The bias added to the result of convolution - :param stride: Stride of the 2D convolution operation. Default: 1 - :param padding: Size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: Dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and the shape of weight should be ``(groups, out_channel // groups, - in_channels // groups, height, width)``. - :type conv_mode: string or :class:`mgb.opr_param_defs.Convolution.Mode` - :param conv_mode: Supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default: - 'CROSS_CORRELATION'. - :param dtype: Support for np.dtype, Default: - np.int8. - :type compute_mode: string or - :class:`mgb.opr_param_defs.Convolution.ComputeMode` - :param compute_mode: When set to 'DEFAULT', no special requirements will be - placed on the precision of intermediate results. When set to 'FLOAT32', - Float32 would be used for accumulator and intermediate result, but only - effective when input and output are of Float16 dtype. - - """ - ph, pw = _pair(padding) - sh, sw = _pair_nonzero(stride) - dh, dw = _pair_nonzero(dilation) - sparse_type = "DENSE" if groups == 1 else "GROUP" - res = mgb.opr.conv_bias_activation( - inp, - weight, - bias, - compute_mode=compute_mode, - dtype=dtype, - strategy=get_conv_execution_strategy(), - nonlineMode=nonlinear_mode, - sparse=sparse_type, - format="NCHW", - pad_h=ph, - pad_w=pw, - stride_h=sh, - stride_w=sw, - dilate_h=dh, - dilate_w=dw, - mode=conv_mode, - ) - return res diff --git a/python_module/megengine/functional/sort.py b/python_module/megengine/functional/sort.py deleted file mode 100644 index 70597d52..00000000 --- a/python_module/megengine/functional/sort.py +++ /dev/null @@ -1,123 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import functools -from typing import Optional, Tuple, Union - -import megengine._internal as mgb - -from ..core.tensor import Tensor, wrap_io_tensor - -__all__ = ["argsort", "sort", "top_k"] - - -@wrap_io_tensor -def argsort(inp: Tensor, descending: bool = False) -> Tuple[Tensor, Tensor]: - r""" - Sort the target 2d matrix by row, return both the sorted tensor and indices. 
- - :param inp: The input tensor, if 2d, each row will be sorted - :param descending: Sort in descending order, where the largest comes first. Default: ``False`` - :return: Tuple of two tensors (sorted_tensor, indices_of_int32) - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - data = tensor(np.array([1,2], dtype=np.float32)) - sorted, indices = F.argsort(data) - print(sorted.numpy(), indices.numpy()) - - Outputs: - - .. testoutput:: - - [1. 2.] [0 1] - - """ - assert len(inp.imm_shape) <= 2, "Input should be 1d or 2d" - if descending: - order = mgb.opr_param_defs.Argsort.Order.DESCENDING - else: - order = mgb.opr_param_defs.Argsort.Order.ASCENDING - if len(inp.imm_shape) == 1: - inp = inp.reshape(1, -1) - tns, ind = mgb.opr.argsort(inp, order=order) - return tns[0], ind[0] - return mgb.opr.argsort(inp, order=order) - - -@functools.wraps(argsort) -def sort(*args, **kwargs): - return argsort(*args, **kwargs) - - -@wrap_io_tensor -def top_k( - inp: Tensor, - k: int, - descending: bool = False, - kth_only: bool = False, - no_sort: bool = False, -) -> Tuple[Tensor, Tensor]: - r""" - Selected the Top-K (by default) smallest elements of 2d matrix by row. - - :param inp: The input tensor, if 2d, each row will be sorted - :param k: The number of elements needed - :param descending: If true, return the largest elements instead. Default: ``False`` - :param kth_only: If true, only the k-th element will be returned. Default: ``False`` - :param no_sort: If true, the returned elements can be unordered. Default: ``False`` - :return: Tuple of two tensors (topk_tensor, indices_of_int32) - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - data = tensor(np.array([2, 4, 6, 8, 7, 5, 3, 1], dtype=np.float32)) - top, indices = F.top_k(data, 5) - print(top.numpy(), indices.numpy()) - - Outputs: - - .. testoutput:: - - [1. 2. 3. 4. 5.] [7 0 6 1 5] - - """ - assert len(inp.imm_shape) <= 2, "Input should be 1d or 2d" - if kth_only: - raise NotImplementedError( - "TODO: would enconter:" - "NotImplementedError: SymbolVar var could not be itered" - ) - if descending: - inp = -inp - Mode = mgb.opr_param_defs.TopK.Mode - if kth_only: - mode = Mode.KTH_ONLY - elif no_sort: - mode = Mode.VALUE_IDX_NOSORT - else: - mode = Mode.VALUE_IDX_SORTED - if len(inp.imm_shape) == 1: - inp = inp.reshape(1, -1) - tns, ind = mgb.opr.top_k(inp, k, mode=mode) - tns = tns[0] - ind = ind[0] - else: - tns, ind = mgb.opr.top_k(inp, k, mode=mode) - if descending: - tns = -tns - return tns, ind diff --git a/python_module/megengine/functional/tensor.py b/python_module/megengine/functional/tensor.py deleted file mode 100644 index b9a14ee2..00000000 --- a/python_module/megengine/functional/tensor.py +++ /dev/null @@ -1,667 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import functools -from typing import Iterable, List, Optional, Union - -import numpy as np - -import megengine._internal as mgb -from megengine._internal import CompGraph, CompNode - -from ..core import zeros -from ..core.graph import _use_default_if_none -from ..core.tensor import Tensor, wrap_io_tensor -from .elemwise import ceil -from .utils import _decide_comp_node_and_comp_graph - - -@wrap_io_tensor -def broadcast_to(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: - """ - Broadcast a tensor to ``shape`` - - :param inp: The input tensor - :param shape: The target shape - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - data = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - out = F.broadcast_to(data, (4, 2, 3)) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[[0. 1. 2.] - [3. 4. 5.]] - - [[0. 1. 2.] - [3. 4. 5.]] - - [[0. 1. 2.] - [3. 4. 5.]] - - [[0. 1. 2.] - [3. 4. 5.]]] - - """ - - if isinstance(shape, int): - shape = (shape,) - return mgb.opr.broadcast(inp, shape) - - -def _get_idx(index, axis): - index_dims = len(index.imm_shape) - idx = [] - comp_node, comp_graph = _decide_comp_node_and_comp_graph(index) - for i in range(index_dims): - if i != axis: - shape = [1] * index_dims - shape[i] = index.axis_shape(i) - arange = mgb.opr.linspace( - 0, - index.axis_shape(i) - 1, - index.axis_shape(i), - comp_node=comp_node, - comp_graph=comp_graph, - ) - arange = ( - arange.reshape(*shape) - .broadcast(index.shape) - .reshape(-1) - .astype(np.int32) - ) - idx.append(arange) - else: - idx.append(index.reshape(-1)) - return tuple(idx) - - -@wrap_io_tensor -def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor: - r""" - Gather data from :attr:`inp` on :attr:`axis` using :attr:`index`. - - For a 3-D tensor, the output is specified by:: - - out[i][j][k] = inp[index[i][j][k]][j][k] # if axis == 0 - out[i][j][k] = inp[i][index[i][j][k]][k] # if axis == 1 - out[i][j][k] = inp[i][j][index[i][j][k]] # if axis == 2 - - if :attr:`inp` is an n-dimensional tensor with size - :math:`(x_0,x_1,...,x_{i-1},x_i,x_{i+1},...,x_{n-1})` and axis=i, - then :attr:`index` must be an n-dimensional tensor with size - :math:`(x_0,x_1,...,x_{i-1},y,x_{i+1},...,x_{n-1})` where :math:`y\ge 1` and - output will have the same size as :attr:`index`. - - - :param inp: the source tensor - :param axis: the axis along which to index - :param index: the indices of elements to gather - - Examples: - - .. testcode:: - - import megengine.functional as F - from megengine.core import tensor - - inp = tensor([ - [1,2], [3,4], [5,6], - ]) - index = tensor([[0,2], [1,0]]) - oup = F.gather(inp, 0, index) - print(oup.numpy()) - - Outputs: - - .. 
testoutput:: - - [[1 6] - [3 2]] - - """ - - input_shape = inp.imm_shape - index_shape = index.imm_shape - input_dims = len(input_shape) - index_dims = len(index_shape) - if input_dims != index_dims: - raise ValueError( - "The index tensor must have same dimensions as input tensor, " - "But the input dims:{}, the index dims:{}".format(input_dims, index_dims) - ) - - if axis < 0 or axis >= input_dims: - raise ValueError( - "Index axis {} is output of bounds, should in range [0 {})".format( - axis, input_dims - ) - ) - - for i in range(input_dims): - if i != axis and input_shape[i] != index_shape[i]: - raise ValueError( - "The input {} and index {} must have the same size apart from axis {}".format( - input_shape, index_shape, axis - ) - ) - - idx = _get_idx(index, axis) - return mgb.opr.advanced_indexing(inp)[idx].reshape( - index.shape - ) # pylint: disable=no-member - - -@wrap_io_tensor -def concat( - inps: Iterable[Tensor], - axis: int = 0, - device: Optional[CompNode] = None, - comp_graph: Optional[CompGraph] = None, -) -> Tensor: - r""" - Concat some tensors - - :param inps: Input tensors to concat - :param axis: the dimension over which the tensors are concatenated. Default: 0 - :param device: The comp node output on. Default: None - :param comp_graph: The graph in which output is. Default: None - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - data1 = tensor(np.arange(0, 6, dtype=np.float32).reshape((2, 3))) - data2 = tensor(np.arange(6, 12, dtype=np.float32).reshape((2, 3))) - out = F.concat([data1, data2]) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[ 0. 1. 2.] - [ 3. 4. 5.] - [ 6. 7. 8.] - [ 9. 10. 11.]] - - """ - - # Output buffer not supported - return mgb.opr.concat( - *list(inps), axis=axis, comp_node=device, comp_graph=comp_graph - ) - - -@wrap_io_tensor -def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: - r""" - Writes all values from the tensor :attr:`source` into :attr:`inp` at the indices specified in the :attr:`index` tensor. - - For each value in :attr:`source`, its output index is specified by its index - in :attr:`source` for ``axis != dimension`` and by the corresponding value in - :attr:`index` for ``axis = dimension``. - - For a 3-D tensor, :attr:`inp` is updated as:: - - inp[index[i][j][k]][j][k] = source[i][j][k] # if axis == 0 - inp[i][index[i][j][k]][k] = source[i][j][k] # if axis == 1 - inp[i][j][index[i][j][k]] = source[i][j][k] # if axis == 2 - - :attr:`inp`, :attr:`index` and :attr:`source` should have same number of dimensions. - - It is also required that ``source.shape(d) <= inp.shape(d)`` and ``index.shape(d) == source.shape(d)`` - for all dimensions ``d``. - - Moreover, the values of :attr:`index` must be between ``0`` and ``inp.shape(axis) - 1`` inclusive. - - .. note:: - Please notice that, due to performance issues, the result is uncertain on the GPU device - if scatter difference positions from source to the same destination position - regard to index tensor. - - Show the case using the following examples, the oup[0][2] is maybe - from source[0][2] which value is 0.2256 or source[1][2] which value is 0.5339 - if set the index[1][2] from 1 to 0. - - :param inp: the inp tensor which to be scattered - :param axis: the axis along which to index - :param index: the indices of elements to scatter - :param source: the source element(s) to scatter - - Examples: - - .. 
testcode:: - - import numpy as np - import megengine.functional as F - from megengine.core import tensor - - inp = tensor(np.zeros(shape=(3,5),dtype=np.float32)) - source = tensor([[0.9935,0.9465,0.2256,0.8926,0.4396],[0.7723,0.0718,0.5939,0.357,0.4576]]) - index = tensor([[0,2,0,2,1],[2,0,1,1,2]]) - oup = F.scatter(inp, 0, index,source) - print(oup.numpy()) - - Outputs: - - .. testoutput:: - - [[0.9935 0.0718 0.2256 0. 0. ] - [0. 0. 0.5939 0.357 0.4396] - [0.7723 0.9465 0. 0.8926 0.4576]] - - """ - - input_shape = inp.imm_shape - index_shape = index.imm_shape - source_shape = source.imm_shape - input_dims = len(input_shape) - index_dims = len(index_shape) - source_dims = len(source_shape) - - if input_dims != index_dims or input_dims != source_dims: - raise ValueError("The input, source and index tensor must have same dimensions") - - if axis < 0 or axis >= input_dims: - raise ValueError( - "Index axis {} is output of bounds, should in range [0 {})".format( - axis, input_dims - ) - ) - - for i in range(source_dims): - if source_shape[i] > input_shape[i]: - raise ValueError( - "The each shape size for source {} must be less than or equal to input {} ".format( - source_shape, input_shape - ) - ) - - for i in range(index_dims): - if index_shape[i] != source_shape[i]: - raise ValueError( - "The each shape size for index {} must be equal to source {} ".format( - index_shape, source_shape - ) - ) - - for i in range(index_dims): - if i != axis and index_shape[i] > input_shape[i]: - raise ValueError( - "The index {} must be less than or equal to input {} size apart from axis {}".format( - index_shape, input_shape, axis - ) - ) - - idx = _get_idx(index, axis) - return mgb.opr.set_advanced_indexing(inp, source.flatten())[idx] - - -@wrap_io_tensor -def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor: - r""" - Select elements either from Tensor x or Tensor y, according to mask. - - .. math:: - - \textrm{out}_i = x_i \textrm{ if } \textrm{mask}_i \textrm{ is True else } y_i - - :param mask: a mask used for choosing x or y - :param x: the first choice - :param y: the second choice - - Examples: - - .. testcode:: - - from megengine import tensor - import megengine.functional as F - mask = tensor(np.array([[1, 0], [0, 1]], dtype=np.int32)) - x = tensor(np.array([[1, np.inf], [np.nan, 4]], - dtype=np.float32)) - y = tensor(np.array([[5, 6], [7, 8]], dtype=np.float32)) - out = F.where(mask, x, y) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[1. 6.] - [7. 4.]] - """ - v0, index0 = mgb.opr.cond_take( - x, mask, mode=mgb.opr_param_defs.CondTake.Mode.EQ, val=1 - ) - v1, index1 = mgb.opr.cond_take( - y, mask, mode=mgb.opr_param_defs.CondTake.Mode.EQ, val=0 - ) - out = x.flatten() - index = mgb.opr.concat(index0, index1, axis=0) - v = mgb.opr.concat(v0, v1, axis=0) - out = mgb.opr.set_advanced_indexing(out, v)[index] - out = out.reshape(x.shape) - return out - - -@wrap_io_tensor -def cond_take(mask: Tensor, x: Tensor, val=1) -> Tensor: - r""" - Take elements from data if specific condition is satisfied on mask. This operator has two outputs: the first is the elements taken, and the second is the indices corresponding to those elements; they are both 1-dimensional. High-dimension input would first be flattened. - - :param mask: condition param; must be the same shape with data - :param x: input tensor from which to take elements - :param val: value to be compared to by mode - - Examples: - - .. 
testcode:: - - from megengine import tensor - import megengine.functional as F - mask = tensor(np.array([[1, 0], [0, 1]], dtype=np.int32)) - x = tensor(np.array([[1, np.inf], [np.nan, 4]], - dtype=np.float32)) - v, index = F.cond_take(mask, x, 1) - print(v, index) - - Outputs: - - .. testoutput:: - - Tensor([1. 4.]) Tensor([0 3], dtype=int32) - - """ - - v, index = mgb.opr.cond_take( - x, mask, mode=mgb.opr_param_defs.CondTake.Mode.EQ, val=val - ) - return v, index - - -def shapeof(x: Tensor, axis=None): - r""" - The shape of input tensor. - """ - return x.shapeof(axis=axis) - - -@wrap_io_tensor -def dimshuffle(inp: Tensor, pattern: Iterable[int]) -> Tensor: - r""" - Swap shapes and strides according to given pattern - - :param inp: Input tensor - :param pattern: a list of integers including 0, 1, ... , ``ndim``-1, and any number of ``'x'`` char in dimensions where this tensor should be broadcasted. For examples: - - * (``'x'``) -> make a 0d (scalar) into a 1d vector - * (0, 1) -> identity for 2d vectors - * (1, 0) -> inverts the first and second dimensions - * (``'x'``, 0) -> make a row out of a 1d vector (N to 1xN) - * (0, ``'x'``) -> make a column out of a 1d vector (N to Nx1) - * (2, 0, 1) -> AxBxC to CxAxB - * (0, ``'x'``, 1) -> AxB to Ax1xB - * (1, ``'x'``, 0) -> AxB to Bx1xA - * (1,) -> This remove dimensions 0. It must be a broadcastable dimension (1xA to A) - - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - x = tensor(np.array([[1, 1], [0, 0]], dtype=np.int32)) - out = F.dimshuffle(x, (1, 0)) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[1 0] - [1 0]] - - """ - return mgb.opr.dimshuffle(inp, pattern) - - -@wrap_io_tensor -def reshape(inp: Tensor, target_shape: Iterable[int]) -> Tensor: - r""" - Reshape a tensor to given target shape; total number of logical elements must - remain unchanged - - :param inp: Input tensor - :param target_shape: target shape, the components would be concatenated to form the - target shape, and it can contain an element of -1 representing unspec_axis. - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - x = tensor(np.arange(12, dtype=np.int32)) - out = F.reshape(x, (3, 2, 2)) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[[ 0 1] - [ 2 3]] - - [[ 4 5] - [ 6 7]] - - [[ 8 9] - [10 11]]] - - """ - return mgb.opr.reshape(inp, target_shape) - - -def transpose(inp: Tensor, pattern: Iterable[int]) -> Tensor: - r"""Equivalent to :func:`dimshuffle` - """ - return dimshuffle(inp, pattern) - - -@wrap_io_tensor -def add_axis(inp: Tensor, axis: int) -> Tensor: - r""" - Add dimension before given axis. - - :param inp: Input tensor - :param axis: Place of new axes - :return: The output tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - x = tensor([1, 2]) - out = F.add_axis(x, 0) - print(out.shape) - - Outputs: - - .. testoutput:: - - (1, 2) - - """ - if not isinstance(axis, int): - raise ValueError("axis must be int, but got type:{}".format(type(axis))) - return mgb.opr.add_axis(inp, axis) - - -@wrap_io_tensor -def remove_axis(inp: Tensor, axis: int) -> Tensor: - r""" - Remove dimension of shape 1. - - :param inp: Input tensor - :param axis: Place of axis to be removed - :return: The output tensor - - Examples: - - .. 
testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - x = tensor(np.array([1, 2], dtype=np.int32).reshape(1, 1, 2, 1)) - out = F.remove_axis(x, 3) - print(out.shape) - - Outputs: - - .. testoutput:: - - (1, 1, 2) - - """ - if not isinstance(axis, int): - raise ValueError("axis must be int, but got type:{}".format(type(axis))) - return mgb.opr.remove_axis(inp, axis) - - -def linspace( - start: Union[int, float, Tensor], - stop: Union[int, float, Tensor], - num: Union[int, Tensor], - dtype=np.float32, - device: Optional[CompNode] = None, - comp_graph: Optional[CompGraph] = None, -) -> Tensor: - r""" - Return equally spaced numbers over a specified interval - - :param start: Starting value of the squence, shoule be scalar - :param stop: The last value of the squence, shoule be scalar - :param num: number of values to generate - :param dtype: result data type - :return: The generated tensor - - Examples: - - .. testcode:: - - import numpy as np - import megengine.functional as F - - a = F.linspace(3,10,5) - print(a.numpy()) - - .. testoutput:: - - [ 3. 4.75 6.5 8.25 10. ] - - """ - if dtype is not np.float32: - raise ValueError("linspace is only implemented for float32") - - device, comp_graph = _use_default_if_none(device, comp_graph) - ret = Tensor( - mgb.opr.linspace(start, stop, num, comp_node=device, comp_graph=comp_graph) - ) - return ret.astype(dtype) - - -def arange( - start: Union[int, float, Tensor], - end: Union[int, float, Tensor], - step: Union[int, float, Tensor] = 1, - dtype=np.float32, - device: Optional[CompNode] = None, - comp_graph: Optional[CompGraph] = None, -) -> Tensor: - r""" - Returns a Tensor with values from `start` to `end` with adjacent interval `step` - - :param start: starting value of the squence, shoule be scalar - :param end: ending value of the squence, shoule be scalar - :param step: the gap between each pair of adjacent values. Default 1 - :param dtype: result data type - :return: The generated tensor - - Examples: - - .. testcode:: - - import numpy as np - import megengine.functional as F - - a = F.arange(1, 5, 1) - print(a.numpy()) - - .. testoutput:: - - [1. 2. 3. 4.] - - """ - if dtype is not np.float32: - raise ValueError("arange is only implemented for float32") - num = ceil((end - start) / step) - stop = start + step * (num - 1) - ret = linspace(start, stop, num, device=device, comp_graph=comp_graph) - return ret - - -def zeros_like(inp: Tensor) -> Tensor: - r""" - Returns a zero tensor with the same shape as input tensor - - :param inp: input tensor - - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.zeros_like(inp) - print(out.numpy()) - - .. testoutput:: - - [[0 0 0] - [0 0 0]] - - """ - return zeros(inp.shapeof()).astype(inp.dtype) diff --git a/python_module/megengine/functional/utils.py b/python_module/megengine/functional/utils.py deleted file mode 100644 index 0ad969ca..00000000 --- a/python_module/megengine/functional/utils.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
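The removed arange above reduces to linspace: it derives the sample count with ceil((end - start) / step) and the last generated value from that count. A minimal stand-in for the same derivation, assuming plain NumPy in place of the removed MegEngine ops:

    import math
    import numpy as np

    def arange_via_linspace(start, end, step=1.0):
        num = math.ceil((end - start) / step)  # number of generated values
        stop = start + step * (num - 1)        # last value actually produced
        return np.linspace(start, stop, num, dtype=np.float32)

    print(arange_via_linspace(1, 5, 1))  # [1. 2. 3. 4.]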
-from typing import Iterable, Union
-
-import megengine._internal as mgb
-
-from ..core.graph import _use_default_if_none
-from ..core.tensor import Tensor, wrap_io_tensor
-from .elemwise import equal
-from .sort import top_k
-
-
-def _decide_comp_node_and_comp_graph(*args: mgb.SymbolVar):
-    for i in args:
-        if isinstance(i, mgb.SymbolVar):
-            return i.comp_node, i.owner_graph
-    return _use_default_if_none(None, None)
-
-
-def accuracy(
-    logits: Tensor, target: Tensor, topk: Union[int, Iterable[int]] = 1
-) -> Union[Tensor, Iterable[Tensor]]:
-    r"""
-    Calculate the classification accuracy given predicted logits and ground-truth labels.
-
-    :param logits: Model predictions of shape [batch_size, num_classes],
-        representing the probability (likelyhood) of each class.
-    :param target: Ground-truth labels, 1d tensor of int32
-    :param topk: Specifies the topk values, could be an int or tuple of ints. Default: 1
-    :return: Tensor(s) of classification accuracy between 0.0 and 1.0
-
-    Examples:
-
-    .. testcode::
-
-        import numpy as np
-        from megengine import tensor
-        import megengine.functional as F
-
-        logits = tensor(np.arange(80, dtype=np.int32).reshape(8,10))
-        target = tensor(np.arange(8, dtype=np.int32))
-        top1, top5 = F.accuracy(logits, target, (1, 5))
-        print(top1.numpy(), top5.numpy())
-
-    Outputs:
-
-    .. testoutput::
-
-        [0.] [0.375]
-    """
-    if isinstance(topk, int):
-        topk = (topk,)
-    _, pred = top_k(logits, k=max(topk), descending=True)
-    accs = []
-    for k in topk:
-        correct = equal(
-            pred[:, :k], target.dimshuffle(0, "x").broadcast(target.shapeof(0), k)
-        )
-        accs.append(correct.sum() / target.shapeof(0))
-    if len(topk) == 1:  # type: ignore[arg-type]
-        accs = accs[0]
-    return accs
-
-
-@wrap_io_tensor
-def zero_grad(inp: Tensor) -> Tensor:
-    r"""
-    Returns a tensor which is treated as constant during backward gradient calcuation,
-    i.e. its gradient is zero.
-
-    :param inp: Input tensor.
-
-    See implementation of :func:`~.softmax` for example.
-    """
-    return mgb.opr.zero_grad(inp)
diff --git a/python_module/megengine/hub/__init__.py b/python_module/megengine/hub/__init__.py
deleted file mode 100644
index f07c3979..00000000
--- a/python_module/megengine/hub/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-from .hub import (
-    help,
-    import_module,
-    list,
-    load,
-    load_serialized_obj_from_url,
-    pretrained,
-)
diff --git a/python_module/megengine/hub/const.py b/python_module/megengine/hub/const.py
deleted file mode 100644
index 5f53420b..00000000
--- a/python_module/megengine/hub/const.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
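The removed accuracy above is top_k followed by an equality test against the labels. The same computation in plain NumPy (an illustration with a hypothetical helper name, not the removed API itself):

    import numpy as np

    def topk_accuracy(logits, target, topk=(1,)):
        k = max(topk)
        # column j holds the index of the (j+1)-th largest logit in each row
        pred = np.argsort(-logits, axis=1)[:, :k]
        correct = pred == target[:, None]  # broadcast labels against predictions
        return [correct[:, :n].any(axis=1).mean() for n in topk]

    logits = np.arange(80, dtype=np.float32).reshape(8, 10)
    target = np.arange(8)
    print(topk_accuracy(logits, target, (1, 5)))  # [0.0, 0.375]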
-DEFAULT_BRANCH_NAME = "master"
-HUBCONF = "hubconf.py"
-HUBDEPENDENCY = "dependencies"
-DEFAULT_GIT_HOST = "github.com"
-ENV_MGE_HOME = "MGE_HOME"
-ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME"
-DEFAULT_CACHE_DIR = "~/.cache"
-DEFAULT_PROTOCOL = "HTTPS"
-HTTP_READ_TIMEOUT = 120
diff --git a/python_module/megengine/hub/exceptions.py b/python_module/megengine/hub/exceptions.py
deleted file mode 100644
index aab0a134..00000000
--- a/python_module/megengine/hub/exceptions.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-class FetcherError(Exception):
-    """Base class for fetch related error."""
-
-
-class InvalidRepo(FetcherError):
-    """The repo provided was somehow invalid."""
-
-
-class InvalidGitHost(FetcherError):
-    """The git host provided was somehow invalid."""
-
-
-class GitPullError(FetcherError):
-    """A git pull error occurred"""
-
-
-class GitCheckoutError(FetcherError):
-    """A git checkout error occurred"""
-
-
-class InvalidProtocol(FetcherError):
-    """The protocol provided was somehow invalid"""
diff --git a/python_module/megengine/hub/fetcher.py b/python_module/megengine/hub/fetcher.py
deleted file mode 100644
index dfbfb0e7..00000000
--- a/python_module/megengine/hub/fetcher.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
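The constants above fix the repo_info convention used by the fetchers that follow: "repo_owner/repo_name[:tag_name/:branch_name]", with the branch defaulting to master. A minimal sketch of that parsing rule (a hypothetical helper mirroring _parse_repo_info below):

    DEFAULT_BRANCH_NAME = "master"

    def parse_repo_info(repo_info):
        prefix, _, branch = repo_info.partition(":")
        repo_owner, repo_name = prefix.split("/")
        return repo_owner, repo_name, branch or DEFAULT_BRANCH_NAME

    print(parse_repo_info("brain_sdk/MegBrain:hub"))  # ('brain_sdk', 'MegBrain', 'hub')
    print(parse_repo_info("brain_sdk/MegBrain"))      # ('brain_sdk', 'MegBrain', 'master')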
-import hashlib -import os -import re -import shutil -import subprocess -from tempfile import NamedTemporaryFile -from typing import Tuple -from zipfile import ZipFile - -import requests -from tqdm import tqdm - -from megengine.utils.http_download import ( - CHUNK_SIZE, - HTTP_CONNECTION_TIMEOUT, - HTTPDownloadError, -) - -from ..distributed.util import is_distributed, synchronized -from ..logger import get_logger -from .const import DEFAULT_BRANCH_NAME, HTTP_READ_TIMEOUT -from .exceptions import GitCheckoutError, GitPullError, InvalidGitHost, InvalidRepo -from .tools import cd - -logger = get_logger(__name__) - -HTTP_TIMEOUT = (HTTP_CONNECTION_TIMEOUT, HTTP_READ_TIMEOUT) - -pattern = re.compile( - r"^(?:[a-z0-9]" # First character of the domain - r"(?:[a-z0-9-_]{0,61}[a-z0-9])?\.)" # Sub domain + hostname - r"+[a-z0-9][a-z0-9-_]{0,61}" # First 61 characters of the gTLD - r"[a-z]$" # Last character of the gTLD -) - - -class RepoFetcherBase: - @classmethod - def fetch( - cls, - git_host: str, - repo_info: str, - use_cache: bool = False, - commit: str = None, - silent: bool = True, - ) -> str: - raise NotImplementedError() - - @classmethod - def _parse_repo_info(cls, repo_info: str) -> Tuple[str, str, str]: - try: - branch_info = DEFAULT_BRANCH_NAME - if ":" in repo_info: - prefix_info, branch_info = repo_info.split(":") - else: - prefix_info = repo_info - repo_owner, repo_name = prefix_info.split("/") - return repo_owner, repo_name, branch_info - except ValueError: - raise InvalidRepo("repo_info: '{}' is invalid.".format(repo_info)) - - @classmethod - def _check_git_host(cls, git_host): - return cls._is_valid_domain(git_host) or cls._is_valid_host(git_host) - - @classmethod - def _is_valid_domain(cls, s): - try: - return pattern.match(s.encode("idna").decode("ascii")) - except UnicodeError: - return False - - @classmethod - def _is_valid_host(cls, s): - nums = s.split(".") - if len(nums) != 4 or any(not _.isdigit() for _ in nums): - return False - return all(0 <= int(_) < 256 for _ in nums) - - @classmethod - def _gen_repo_dir(cls, repo_dir: str) -> str: - return hashlib.sha1(repo_dir.encode()).hexdigest()[:16] - - -class GitSSHFetcher(RepoFetcherBase): - @classmethod - @synchronized - def fetch( - cls, - git_host: str, - repo_info: str, - use_cache: bool = False, - commit: str = None, - silent: bool = True, - ) -> str: - """ - Fetches git repo by SSH protocol - - :param git_host: - host address of git repo. - example: github.com - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. 
- example: ``"brain_sdk/MegBrain[:hub]"`` - :param use_cache: - whether to use locally fetched code or completely re-fetch - :param commit: - commit id on github or gitlab - :param silent: - whether to accept the stdout and stderr of the subprocess with PIPE, instead of - displaying on the screen - :return: - directory where the repo code is stored - """ - if not cls._check_git_host(git_host): - raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host)) - - repo_owner, repo_name, branch_info = cls._parse_repo_info(repo_info) - normalized_branch_info = branch_info.replace("/", "_") - repo_dir_raw = "{}_{}_{}".format( - repo_owner, repo_name, normalized_branch_info - ) + ("_{}".format(commit) if commit else "") - repo_dir = cls._gen_repo_dir(repo_dir_raw) - git_url = "git@{}:{}/{}.git".format(git_host, repo_owner, repo_name) - - if use_cache and os.path.exists(repo_dir): # use cache - logger.debug("Cache Found in %s", repo_dir) - return repo_dir - - if is_distributed(): - logger.warning( - "When using `hub.load` or `hub.list` to fetch git repositories\n" - " in DISTRIBUTED mode for the first time, processes are synchronized to\n" - " ensure that target repository is ready to use for each process.\n" - " Users are expected to see this warning no more than ONCE, otherwise\n" - " (very little chance) you may need to remove corrupt cache\n" - " `%s` and fetch again.", - repo_dir, - ) - - shutil.rmtree(repo_dir, ignore_errors=True) # ignore and clear cache - - logger.debug( - "Git Clone from Repo:%s Branch: %s to %s", - git_url, - normalized_branch_info, - repo_dir, - ) - - kwargs = ( - {"stderr": subprocess.PIPE, "stdout": subprocess.PIPE} if silent else {} - ) - if commit is None: - # shallow clone repo by branch/tag - p = subprocess.Popen( - [ - "git", - "clone", - "-b", - normalized_branch_info, - git_url, - repo_dir, - "--depth=1", - ], - **kwargs, - ) - cls._check_clone_pipe(p) - else: - # clone repo and checkout to commit_id - p = subprocess.Popen(["git", "clone", git_url, repo_dir], **kwargs) - cls._check_clone_pipe(p) - - with cd(repo_dir): - logger.debug("git checkout to %s", commit) - p = subprocess.Popen(["git", "checkout", commit], **kwargs) - _, err = p.communicate() - if p.returncode: - shutil.rmtree(repo_dir, ignore_errors=True) - raise GitCheckoutError( - "Git checkout error, please check the commit id.\n" - + err.decode() - ) - with cd(repo_dir): - shutil.rmtree(".git") - - return repo_dir - - @classmethod - def _check_clone_pipe(cls, p): - _, err = p.communicate() - if p.returncode: - raise GitPullError( - "Repo pull error, please check repo info.\n" + err.decode() - ) - - -class GitHTTPSFetcher(RepoFetcherBase): - @classmethod - @synchronized - def fetch( - cls, - git_host: str, - repo_info: str, - use_cache: bool = False, - commit: str = None, - silent: bool = True, - ) -> str: - """ - Fetches git repo by HTTPS protocol - - :param git_host: - host address of git repo - example: github.com - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. 
- example: ``"brain_sdk/MegBrain[:hub]"`` - :param use_cache: - whether to use locally cached code or completely re-fetch - :param commit: - commit id on github or gitlab - :param silent: - whether to accept the stdout and stderr of the subprocess with PIPE, instead of - displaying on the screen - :return: - directory where the repo code is stored - """ - if not cls._check_git_host(git_host): - raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host)) - - repo_owner, repo_name, branch_info = cls._parse_repo_info(repo_info) - normalized_branch_info = branch_info.replace("/", "_") - repo_dir_raw = "{}_{}_{}".format( - repo_owner, repo_name, normalized_branch_info - ) + ("_{}".format(commit) if commit else "") - repo_dir = cls._gen_repo_dir(repo_dir_raw) - archive_url = cls._git_archive_link( - git_host, repo_owner, repo_name, branch_info, commit - ) - - if use_cache and os.path.exists(repo_dir): # use cache - logger.debug("Cache Found in %s", repo_dir) - return repo_dir - - if is_distributed(): - logger.warning( - "When using `hub.load` or `hub.list` to fetch git repositories " - "in DISTRIBUTED mode for the first time, processes are synchronized to " - "ensure that target repository is ready to use for each process.\n" - "Users are expected to see this warning no more than ONCE, otherwise" - "(very little chance) you may need to remove corrupt hub cache %s and fetch again." - ) - - shutil.rmtree(repo_dir, ignore_errors=True) # ignore and clear cache - - logger.debug("Downloading from %s to %s", archive_url, repo_dir) - cls._download_zip_and_extract(archive_url, repo_dir) - - return repo_dir - - @classmethod - def _download_zip_and_extract(cls, url, target_dir): - resp = requests.get(url, timeout=HTTP_TIMEOUT, stream=True) - if resp.status_code != 200: - raise HTTPDownloadError( - "An error occured when downloading from {}".format(url) - ) - - total_size = int(resp.headers.get("Content-Length", 0)) - _bar = tqdm(total=total_size, unit="iB", unit_scale=True) - - with NamedTemporaryFile("w+b") as f: - for chunk in resp.iter_content(CHUNK_SIZE): - if not chunk: - break - _bar.update(len(chunk)) - f.write(chunk) - _bar.close() - f.seek(0) - with ZipFile(f) as temp_zip_f: - zip_dir_name = temp_zip_f.namelist()[0].split("/")[0] - temp_zip_f.extractall(".") - shutil.move(zip_dir_name, target_dir) - - @classmethod - def _git_archive_link(cls, git_host, repo_owner, repo_name, branch_info, commit): - archive_link = "https://{}/{}/{}/archive/{}.zip".format( - git_host, repo_owner, repo_name, commit or branch_info - ) - - return archive_link diff --git a/python_module/megengine/hub/hub.py b/python_module/megengine/hub/hub.py deleted file mode 100644 index 5342cd8e..00000000 --- a/python_module/megengine/hub/hub.py +++ /dev/null @@ -1,333 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import functools -import hashlib -import os -import sys -import types -from typing import Any, List -from urllib.parse import urlparse - -from megengine.utils.http_download import download_from_url - -from ..core.serialization import load as _mge_load_serialized -from ..distributed import is_distributed -from ..logger import get_logger -from .const import ( - DEFAULT_CACHE_DIR, - DEFAULT_GIT_HOST, - DEFAULT_PROTOCOL, - ENV_MGE_HOME, - ENV_XDG_CACHE_HOME, - HTTP_READ_TIMEOUT, - HUBCONF, - HUBDEPENDENCY, -) -from .exceptions import InvalidProtocol -from .fetcher import GitHTTPSFetcher, GitSSHFetcher -from .tools import cd, check_module_exists, load_module - -logger = get_logger(__name__) - - -PROTOCOLS = { - "HTTPS": GitHTTPSFetcher, - "SSH": GitSSHFetcher, -} - - -def _get_megengine_home() -> str: - """MGE_HOME setting complies with the XDG Base Directory Specification - """ - megengine_home = os.path.expanduser( - os.getenv( - ENV_MGE_HOME, - os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "megengine"), - ) - ) - return megengine_home - - -def _get_repo( - git_host: str, - repo_info: str, - use_cache: bool = False, - commit: str = None, - protocol: str = DEFAULT_PROTOCOL, -) -> str: - if protocol not in PROTOCOLS: - raise InvalidProtocol( - "Invalid protocol, the value should be one of {}.".format( - ", ".join(PROTOCOLS.keys()) - ) - ) - cache_dir = os.path.expanduser(os.path.join(_get_megengine_home(), "hub")) - with cd(cache_dir): - fetcher = PROTOCOLS[protocol] - repo_dir = fetcher.fetch(git_host, repo_info, use_cache, commit) - return os.path.join(cache_dir, repo_dir) - - -def _check_dependencies(module: types.ModuleType) -> None: - if not hasattr(module, HUBDEPENDENCY): - return - - dependencies = getattr(module, HUBDEPENDENCY) - if not dependencies: - return - - missing_deps = [m for m in dependencies if not check_module_exists(m)] - if len(missing_deps): - raise RuntimeError("Missing dependencies: {}".format(", ".join(missing_deps))) - - -def _init_hub( - repo_info: str, - git_host: str, - use_cache: bool = True, - commit: str = None, - protocol: str = DEFAULT_PROTOCOL, -): - """Imports hubmodule like python import - - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. - Example: ``"brain_sdk/MegBrain[:hub]"`` - :param git_host: - host address of git repo - Example: github.com - :param use_cache: - whether to use locally cached code or completely re-fetch - :param commit: - commit id on github or gitlab - :param protocol: - which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. - The value should be one of HTTPS, SSH. 
- :return: - hubconf.py as a python module - """ - cache_dir = os.path.expanduser(os.path.join(_get_megengine_home(), "hub")) - os.makedirs(cache_dir, exist_ok=True) - absolute_repo_dir = _get_repo( - git_host, repo_info, use_cache=use_cache, commit=commit, protocol=protocol - ) - sys.path.insert(0, absolute_repo_dir) - hubmodule = load_module(HUBCONF, os.path.join(absolute_repo_dir, HUBCONF)) - sys.path.remove(absolute_repo_dir) - - return hubmodule - - -@functools.wraps(_init_hub) -def import_module(*args, **kwargs): - return _init_hub(*args, **kwargs) - - -def list( - repo_info: str, - git_host: str = DEFAULT_GIT_HOST, - use_cache: bool = True, - commit: str = None, - protocol: str = DEFAULT_PROTOCOL, -) -> List[str]: - """Lists all entrypoints available in repo hubconf - - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. - Example: ``"brain_sdk/MegBrain[:hub]"`` - :param git_host: - host address of git repo - Example: github.com - :param use_cache: - whether to use locally cached code or completely re-fetch - :param commit: - commit id on github or gitlab - :param protocol: - which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. - The value should be one of HTTPS, SSH. - :return: - all entrypoint names of the model - """ - hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) - - return [ - _ - for _ in dir(hubmodule) - if not _.startswith("__") and callable(getattr(hubmodule, _)) - ] - - -def load( - repo_info: str, - entry: str, - *args, - git_host: str = DEFAULT_GIT_HOST, - use_cache: bool = True, - commit: str = None, - protocol: str = DEFAULT_PROTOCOL, - **kwargs -) -> Any: - """Loads model from github or gitlab repo, with pretrained weights. - - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. - Example: ``"brain_sdk/MegBrain[:hub]"`` - :param entry: - an entrypoint defined in hubconf - :param git_host: - host address of git repo - Example: github.com - :param use_cache: - whether to use locally cached code or completely re-fetch - :param commit: - commit id on github or gitlab - :param protocol: - which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. - The value should be one of HTTPS, SSH. - :return: - a single model with corresponding pretrained weights. - """ - hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) - - if not hasattr(hubmodule, entry) or not callable(getattr(hubmodule, entry)): - raise RuntimeError("Cannot find callable {} in hubconf.py".format(entry)) - - _check_dependencies(hubmodule) - - module = getattr(hubmodule, entry)(*args, **kwargs) - return module - - -def help( - repo_info: str, - entry: str, - git_host: str = DEFAULT_GIT_HOST, - use_cache: bool = True, - commit: str = None, - protocol: str = DEFAULT_PROTOCOL, -) -> str: - """This function returns docstring of entrypoint ``entry`` by following steps: - - 1. Pull the repo code specified by git and repo_info - 2. Load the entry defined in repo's hubconf.py - 3. Return docstring of function entry - - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. 
- Example: ``"brain_sdk/MegBrain[:hub]"`` - :param entry: - an entrypoint defined in hubconf.py - :param git_host: - host address of git repo - Example: github.com - :param use_cache: - whether to use locally cached code or completely re-fetch - :param commit: - commit id on github or gitlab - :param protocol: - which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. - The value should be one of HTTPS, SSH. - :return: - docstring of entrypoint ``entry`` - """ - hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) - - if not hasattr(hubmodule, entry) or not callable(getattr(hubmodule, entry)): - raise RuntimeError("Cannot find callable {} in hubconf.py".format(entry)) - - doc = getattr(hubmodule, entry).__doc__ - return doc - - -def load_serialized_obj_from_url(url: str, model_dir=None) -> Any: - """Loads MegEngine serialized object from the given URL. - - If the object is already present in ``model_dir``, it's deserialized and - returned. If no ``model_dir`` is specified, it will be ``MGE_HOME/serialized``. - - :param url: url to serialized object - :param model_dir: dir to cache target serialized file - - :return: loaded object - """ - if model_dir is None: - model_dir = os.path.join(_get_megengine_home(), "serialized") - os.makedirs(model_dir, exist_ok=True) - - parts = urlparse(url) - filename = os.path.basename(parts.path) - - # use hash as prefix to avoid filename conflict from different urls - sha256 = hashlib.sha256() - sha256.update(url.encode()) - digest = sha256.hexdigest()[:6] - filename = digest + "_" + filename - - cached_file = os.path.join(model_dir, filename) - logger.info( - "load_serialized_obj_from_url: download to or using cached %s", cached_file - ) - if not os.path.exists(cached_file): - if is_distributed(): - logger.warning( - "Downloading serialized object in DISTRIBUTED mode\n" - " File may be downloaded multiple times. We recommend\n" - " users to download in single process first." - ) - download_from_url(url, cached_file, HTTP_READ_TIMEOUT) - - state_dict = _mge_load_serialized(cached_file) - return state_dict - - -class pretrained: - r""" - Decorator which helps to download pretrained weights from the given url. - - For example, we can decorate a resnet18 function as follows - - .. code-block:: - - @hub.pretrained("https://url/to/pretrained_resnet18.pkl") - def resnet18(**kwargs): - return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) - - When decorated function is called with ``pretrained=True``, MegEngine will automatically - download and fill the returned model with pretrained weights. - """ - - def __init__(self, url): - self.url = url - - def __call__(self, func): - @functools.wraps(func) - def pretrained_model_func( - pretrained=False, **kwargs - ): # pylint: disable=redefined-outer-name - model = func(**kwargs) - if pretrained: - weights = load_serialized_obj_from_url(self.url) - model.load_state_dict(weights) - return model - - return pretrained_model_func - - -__all__ = [ - "list", - "load", - "help", - "load_serialized_obj_from_url", - "pretrained", - "import_module", -] diff --git a/python_module/megengine/hub/tools.py b/python_module/megengine/hub/tools.py deleted file mode 100644 index 0bf9c98c..00000000 --- a/python_module/megengine/hub/tools.py +++ /dev/null @@ -1,48 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import importlib.util -import os -import types -from contextlib import contextmanager -from typing import Iterator - - -def load_module(name: str, path: str) -> types.ModuleType: - """ - Loads module specified by name and path - - :param name: module name - :param path: module path - """ - spec = importlib.util.spec_from_file_location(name, path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def check_module_exists(module: str) -> bool: - """Checks whether python module exists or not - - :param module: name of module - """ - return importlib.util.find_spec(module) is not None - - -@contextmanager -def cd(target: str) -> Iterator[None]: - """Changes current directory to target - - :param target: target directory - """ - prev = os.getcwd() - os.chdir(os.path.expanduser(target)) - try: - yield - finally: - os.chdir(prev) diff --git a/python_module/megengine/jit/__init__.py b/python_module/megengine/jit/__init__.py deleted file mode 100644 index d610388b..00000000 --- a/python_module/megengine/jit/__init__.py +++ /dev/null @@ -1,570 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import contextlib -import functools -import itertools -import os -from typing import Callable, Tuple, Union - -import numpy as np - -import megengine._internal as mgb -from megengine._internal.plugin import CompGraphProfiler - -from ..core import Tensor, graph, tensor -from .sublinear_memory_config import SublinearMemoryConfig - - -def sideeffect(f): - # during eager tracing, wrapped function is called with proxy inputs - # during static tracing, wrapped function will not be called at all - @functools.wraps(f) - def wrapper(*args, **kwargs): # pylint: disable=inconsistent-return-statements - if not trace._active_instance: - return f(*args, **kwargs) - - tensors = {} - for i, x in itertools.chain(enumerate(args), kwargs.items()): - if isinstance(x, Tensor): - tensors[i] = x - if tensors: - _keys, tensors = zip(*tensors.items()) - else: - _keys, tensors = (), () - - def callback(*tensors, f=f, keys=_keys, args=args, kwargs=kwargs): - replace = dict(zip(keys, tensors)) - args = tuple(replace.get(i, x) for i, x in enumerate(args)) - kwargs = {i: replace.get(i, x) for i, x in kwargs.items()} - if f(*args, **kwargs) is not None: - raise TypeError("a sideeffect function should return None") - # TODO: clear memory - - trace._active_instance._register_callback(callback, tensors) - - return wrapper - - -def mark_impure(x): - if not trace._active_instance: - return x - return trace._active_instance._mark_impure(x) - - -def barrier(x): - if not trace._active_instance: - return x - return trace._active_instance._insert_barrier(x) - - -def _dummy(): - return mgb.make_immutable(*graph._use_default_if_none(None, None), 0) - - -class unset: - pass - - -class trace: - """ - Wrap a callable and provide: - - * tracing via :meth:`.trace` and :meth:`.dump` - * accelerated evalutaion via :meth:`.__call__` - - :param func: 
Positional only argument. - :param symbolic: Whether to use symbolic tensor. Default: False - :param opt_level: Optimization level for compiling trace. - :param log_level: Log level. - :param sublinear_memory_config: Configuration for sublinear memory optimization. - If not None, it enables sublinear memory optimization with given setting. - :param allreduce_pack_max_size: Maximum size of an allreduce pack in MB. - If not None, multiple gradients will be packed and synchronized together - :param profiling: Whether to profile compiled trace. Default: False - """ - - _active_instance = None - enabled = not os.getenv("MGE_DISABLE_TRACE") - - _UNSTARTED = "unstarted" - _STARTED = "started" - _FINISHED = "finished" - - def __new__(cls, *args, **kwargs): - if not args: - return functools.partial(cls, **kwargs) - return super().__new__(cls) - - def __init__( - self, - func: Callable[..., Union[None, Tensor, Tuple[Tensor]]], - *, - symbolic: bool = False, - opt_level: int = None, - log_level: int = None, - sublinear_memory_config: SublinearMemoryConfig = None, - allreduce_pack_max_size: int = None, - profiling: bool = False - ): - self.__wrapped__ = func - self._symbolic = symbolic - self._graph_opt_level = opt_level - self._log_level = log_level - self._sublinear_memory_config = sublinear_memory_config - self._allreduce_pack_max_size = allreduce_pack_max_size - self._status = self._UNSTARTED - self._args = None - self._kwargs = None - self._outputs = unset - self._sym_outputs = unset - self._outspec = None - self._checkpoint = None - self._compiled_func = None - self._profiling = profiling - self._profiler = None - - @property - def _active(self): - c1 = self._status == self._STARTED - c2 = type(self)._active_instance is self - assert c1 == c2 - return c1 - - def _register_callback(self, f, args=()): - assert self._active - assert isinstance(args, (tuple, list)) - proxies = self._make_proxies(args) - self._forward(args, proxies, checkpoint=True) - # NOTE: under eager graph callback will fire immediately - job = mgb.opr.callback_injector( - self._insert_barrier(_dummy()), lambda _: f(*proxies) - ) - self._insert_checkpoint(job) - self._outspec.append(job) - - def _insert_barrier(self, x): - assert self._active - if self._checkpoint is None: - return x - if isinstance(x, Tensor): - x = x._symvar - wrap = True - else: - wrap = False - if not isinstance(x, mgb.SymbolVar): - raise TypeError - x = mgb.opr.virtual_dep([x, self._checkpoint]) - if wrap: - x = Tensor(x) - return x - - def _insert_checkpoint(self, *args, no_barrier=False): - assert self._active - if not args: - return - args = tuple(x._symvar if isinstance(x, Tensor) else x for x in args) - for x in args: - if not isinstance(x, mgb.SymbolVar): - raise TypeError - if not no_barrier and self._checkpoint is not None: - # normally no need to _insert_barrier here, but if - # someone forget to call _insert_barrier beforehand, - # this can make things less broken - args += (self._checkpoint,) - if len(args) == 1: - self._checkpoint = args[0] - else: - self._checkpoint = mgb.opr.virtual_dep(args) - - def _mark_impure(self, x): - assert self._active - ret = x - if isinstance(x, Tensor): - x = x._symvar - if not isinstance(x, mgb.SymbolVar): - raise TypeError - self._outspec.append(x) - self._insert_checkpoint(x) - return ret - - def _make_proxies(self, args): - assert isinstance(args, (tuple, list)) - for x in args: - assert isinstance(x, Tensor) - return tuple(tensor(dtype=x.dtype, device=x.device) for x in args) - - def _forward(self, srcs, dests, 
checkpoint=True): - # pseudo-op: does not run under static graph; traced - # TODO: use shared memory - assert len(srcs) == len(dests) - if not self._active: - for s, d in zip(srcs, dests): - d.set_value(s, share=False) - return - jobs = [] - for s, d in zip(srcs, dests): - - def callback(value, dest=d): - dest.set_value(value, share=False) - - s = self._insert_barrier(s._symvar) - # NOTE: callback immediately fire in eager graph - jobs.append(mgb.opr.callback_injector(s, callback)) - self._outspec.extend(jobs) - if checkpoint: - self._insert_checkpoint(*jobs, no_barrier=True) - - def _forward_inputs(self, *args, **kwargs): - if self._kwargs is None: - self._kwargs = kwargs - elif self._kwargs != kwargs: - raise ValueError("kwargs must not change between invocations") - - if self._args is None: - self._args = [] - for i in args: - if isinstance(i, Tensor): - self._args.append(tensor(dtype=i.dtype, device=i.device)) - self._args[-1].set_value(i, share=False) - else: - self._args.append(tensor(i)) - else: - if not len(args) == len(self._args): - raise TypeError - for i, proxy in zip(args, self._args): - proxy.set_value(i, share=False) - # XXX: sync? - - def _make_outputs(self, outputs): - if outputs is None: - self._outputs = None - return - if isinstance(outputs, Tensor): - # no one is able to call barrier after this, so no need to checkpoint - # but checkpoint do little harm anyway - (self._outputs,) = self._make_proxies([outputs]) - return - if not isinstance(outputs, (tuple, list)): - raise TypeError("should return (tuple of) tensor") - for i in outputs: - if not isinstance(i, Tensor): - raise TypeError("should return (tuple of) tensor") - self._outputs = self._make_proxies(outputs) - - def _foward_outputs(self, outputs): - # pseudo-op: does not run under static graph; traced - if self._outputs is unset: - self._make_outputs(outputs) - if self._outputs is None: - if outputs is not None: - raise TypeError("should return None") - elif isinstance(self._outputs, Tensor): - if not isinstance(outputs, Tensor): - raise TypeError("should return a tensor") - self._forward([outputs], [self._outputs]) - else: - assert isinstance(self._outputs, tuple) - - def check(): - if not isinstance(outputs, (tuple, list)): - return False - if len(self._outputs) != len(outputs): - return False - for x in outputs: - if not isinstance(x, Tensor): - return False - return True - - if not check(): - raise TypeError( - "should return tuple of %d tensors" % len(self._outputs) - ) - self._forward(outputs, self._outputs) - - def _apply_graph_options(self, cg): - # graph opt level - if self._graph_opt_level is not None: - cg.set_option("graph_opt_level", self._graph_opt_level) - # log level - if self._log_level is not None: - cg.set_option("log_level", self._log_level) - # sublinear - if self._sublinear_memory_config is not None: - cg.set_option("enable_sublinear_memory_opt", True) - cg.set_option( - "sublinear_mem_config.lb_memory", - self._sublinear_memory_config.lb_memory, - ) - cg.set_option( - "sublinear_mem_config.genetic_nr_iter", - self._sublinear_memory_config.genetic_nr_iter, - ) - cg.set_option( - "sublinear_mem_config.genetic_pool_size", - self._sublinear_memory_config.genetic_pool_size, - ) - cg.set_option( - "sublinear_mem_config.thresh_nr_try", - self._sublinear_memory_config.thresh_nr_try, - ) - cg.set_option( - "sublinear_mem_config.num_worker", - self._sublinear_memory_config.num_worker, - ) - # pack allreduce - if self._allreduce_pack_max_size is not None: - cg.set_option("allreduce_pack_max_size", 
self._allreduce_pack_max_size) - # profile - if self._profiling: - self._profiler = CompGraphProfiler(cg) - - def _get_graph(self, eager): - - if eager: - if not hasattr(self, "_eager_graph"): - # pylint: disable=attribute-defined-outside-init - self._eager_graph = graph.Graph(eager_evaluation=True) - self._apply_graph_options(self._eager_graph) - return self._eager_graph - else: - if not hasattr(self, "_static_graph"): - # pylint: disable=attribute-defined-outside-init - self._static_graph = graph.Graph(eager_evaluation=False) - self._apply_graph_options(self._static_graph) - return self._static_graph - - @contextlib.contextmanager - def _prepare(self, args, kwargs, enable): - # prepare for execution - self._forward_inputs(*args, **kwargs) - if not enable: - # XXX: use our own graph here? - cg = None - elif self._status == self._FINISHED: - cg = None - elif self._symbolic: - cg = self._get_graph(eager=False) - else: - cg = self._get_graph(eager=True) - try: - # NOTE: always trace in a new graph, so capturing an undetached tensor - # will never work (would work if tracing in default graph) - if cg is None: - yield - else: - with cg: - yield - finally: - # XXX: properly release memory - if cg: - cg.clear_device_memory() - - @contextlib.contextmanager - def _activate(self): - # prepare for tracing - if self._status != self._UNSTARTED: - raise RuntimeError("cannot trace a second time") - if type(self)._active_instance is not None: - raise RuntimeError("nested trace is unsupported") - self._status = self._STARTED - type(self)._active_instance = self - self._user_cache = {} - try: - yield - finally: - self._status = self._FINISHED - self._user_cache = None - type(self)._active_instance = None - - def _run_wrapped(self): - outputs = self.__wrapped__(*self._args, **self._kwargs) - self._foward_outputs(outputs) - return outputs - - def _do_trace(self): - with self._activate(): - self._outspec = [] - outputs = self._run_wrapped() - if outputs is None: - self._sym_outputs = None - else: - if isinstance(outputs, Tensor): - outputs = [outputs] - # _run_wrapped has checked validity of outputs - self._sym_outputs = tuple(i._symvar for i in outputs) - mgb.comp_graph_tools.set_priority_to_id(self._outspec) - self._compiled_func = graph.get_default_graph().compile(None, self._outspec) - - def trace(self, *args: Tensor, **kwargs): - """ - Trace wrapped callable with provided arguments. - """ - with self._prepare(args, kwargs, enable=True): - self._do_trace() - return self - - def __call__(self, *args: Tensor, **kwargs): - """ - Evaluate on provided arguments, using compiled trace - instead of the original callable if applicable. - - :return: ``None`` or :class:`~.Tensor` or tuple of :class:`~.Tensor`, depending on the - return value of wrapped callable. - """ - with self._prepare(args, kwargs, enable=self.enabled): - if not self.enabled: - self._run_wrapped() - elif self._status == self._FINISHED: - self._compiled_func() - else: - if self._status == self._UNSTARTED: - self._do_trace() - if self._symbolic: - self._compiled_func() - return self._outputs - - def dump( - self, - fpath, - *, - arg_names=None, - append=False, - optimize_for_inference=False, - output_names=None, - **kwargs - ): - """ - Serialize trace to file system. - - :param fpath: positional only argument. Path of output file. - :param arg_names: names of the input tensors in the traced function. - :param append: whether output is appended to ``fpath``. 
- :param optimize_for_inference: whether to enable optimize_for_inference - pass before dump. - :param output_names: names of the output tensors in the traced function, - will use the default name if does not specify. - - :param enable_io16xc32: whether to use float16 for I/O between oprs and use - float32 as internal computation precision. Note the output var would be - changed to float16. - :param enable_ioc16: whether to use float16 for both I/O and computation - precision. - - :param enable_hwcd4: whether to use NHWCD4 data layout. This is faster on some - OpenCL backend. - :param enable_nchw88: whether to use NCHW88 data layout. it currently - used in X86 AVX backend. - :param enable_nchw44: whether to use NCHW44 data layout. it currently - used in arm backend. - :param enable_nchw44_dot: whether to use NCHW44_dot data layout. it currently - used in armv8.2+dotprod backend. - :param enable_nchw4: whether to use NCHW4 data layout. it currently - used in nvidia backend(based on cudnn). - :param enable_nchw32: whether to use NCHW32 data layout. it currently - used in nvidia backend with tensorcore(based on cudnn). - :param enable_chwn4: whether to use CHWN4 data layout. it currently - used in nvidia backend with tensorcore. - - :param enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty - into one opr. - :param enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z - input for inference on nvidia backend(this optimization pass will - result in mismatch of the precision of output of training and - inference) - """ - if self._status != self._FINISHED: - raise ValueError("not traced") - assert isinstance(self._sym_outputs, (tuple, type(None))) - if not self._sym_outputs: - raise ValueError("not outputs") - if arg_names is None: - arg_names = ["arg_%d" % i for i in range(len(self._args))] - elif len(arg_names) != len(self._args): - raise ValueError( - "len(arg_names) should be {}, got {}".format( - len(self._args), len(arg_names) - ) - ) - if isinstance(output_names, str): - output_names = [output_names] - if output_names is None: - output_names = [var.name for var in self._sym_outputs] - elif len(output_names) != len(self._sym_outputs): - raise ValueError( - "len(output_names) should be {}, got {}".format( - len(self._sym_outputs), len(output_names) - ) - ) - - optimize_for_inference_args_map = { - "enable_io16xc32": "f16_io_f32_comp", - "enable_ioc16": "f16_io_comp", - "enable_hwcd4": "use_nhwcd4", - "enable_nchw4": "use_nchw4", - "enable_nchw88": "use_nchw88", - "enable_nchw32": "use_nchw32", - "enable_nchw44": "use_nchw44", - "enable_nchw44_dot": "use_nchw44_dot", - "enable_chwn4": "use_chwn4", - "enable_fuse_conv_bias_nonlinearity": "fuse_conv_bias_nonlinearity", - "enable_fuse_conv_bias_with_z": "fuse_conv_bias_with_z", - } - if optimize_for_inference: - optimize_for_inference_kwargs = {} - for k, v in optimize_for_inference_args_map.items(): - if kwargs.pop(k, False): - optimize_for_inference_kwargs[v] = True - else: - for k in optimize_for_inference_args_map: - if kwargs.get(k, False): - raise ValueError( - "cannot set %s when optimize_for_inference is not set" % k - ) - if kwargs: - raise ValueError("unknown options: %s" % list(kwargs)) - - cg = self._sym_outputs[0].owner_graph - replace = {} - for t, name in zip(self._args, arg_names): - # relies on symvar dedup - s = t.__mgb_symvar__(comp_graph=cg) - replace[s] = mgb.make_arg( - t.device, cg, dtype=t.dtype, shape=t.shape, name=name - ) - # Convert VolatileSharedDeviceTensor to SharedDeviceTensor, - # 
otherwise some optimizations would not work. The conversion is - # safe because there simply is no way (using builtin ops) to make - # a VolatileSharedDeviceTensor actually volatile. - for s in mgb.cgtools.get_dep_vars( - self._sym_outputs, "VolatileSharedDeviceTensor" - ): - if s in replace: - continue # is an input - replace[s] = mgb.SharedND._from_symvar(s).symvar( - cg, name=s.name, volatile=False - ) - sym_outputs = mgb.cgtools.replace_vars(self._sym_outputs, replace) - sym_outputs = list(sym_outputs) - if optimize_for_inference: - sym_outputs = mgb.optimize_for_inference( - sym_outputs, **optimize_for_inference_kwargs - ) - for var, name in zip(sym_outputs, output_names): - var.rename(name) - mgb.serialize_comp_graph_to_file(fpath, sym_outputs, append=append) - - def get_profile(self): - """ - Get profiling result for compiled trace. - - :return: a json compatible object. - """ - if not self._profiler: - raise RuntimeError("trace is not set with profiling=True") - return self._profiler.get() diff --git a/python_module/megengine/jit/sublinear_memory_config.py b/python_module/megengine/jit/sublinear_memory_config.py deleted file mode 100644 index fc3e46dd..00000000 --- a/python_module/megengine/jit/sublinear_memory_config.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -from ..core.device import get_device_count - - -class SublinearMemoryConfig: - r""" - Configuration for sublinear memory optimization. - - :param thresh_nr_try: number of samples both for searching in linear space - and around current thresh in sublinear memory optimization. Default: 10. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_THRESH_NR_TRY'. - :param genetic_nr_iter: number of iterations to find the best checkpoints in genetic algorithm. - Default: 0. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER'. - :param genetic_pool_size: number of samples for the crossover random selection - during genetic optimization. Default: 20. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_POOL_SIZE'. - :param lb_memory: memory lower bound of bottleneck size in MB for sublinear memory optimization. - It can be used to perform manual tradeoff between memory and speed. Default: 0. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_LOWER_BOUND_MB'. - :param num_worker: number of thread workers to search the optimum checkpoints - in sublinear memory optimization. Default: half of cpu number in the system. - Note: the value must be greater or equal to one. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_WORKERS'. - - Note that the environmental variable MGB_COMP_GRAPH_OPT must be set to 'enable_sublinear_memory_opt=1' - in order for the above environmental variable to be effective. 
- """ - - def __init__( - self, - thresh_nr_try: int = 10, - genetic_nr_iter: int = 0, - genetic_pool_size: int = 20, - lb_memory: int = 0, - num_worker: int = max(1, get_device_count("cpu") // 2), - ): - assert thresh_nr_try >= 0, "thresh_nr_try must be greater or equal to zero" - self.thresh_nr_try = thresh_nr_try - assert genetic_nr_iter >= 0, "genetic_nr_iter must be greater or equal to zero" - self.genetic_nr_iter = genetic_nr_iter - assert ( - genetic_pool_size >= 0 - ), "genetic_pool_size must be greater or equal to zero" - self.genetic_pool_size = genetic_pool_size - self.lb_memory = lb_memory - assert num_worker > 0, "num_worker must be greater or equal to one" - self.num_worker = num_worker diff --git a/python_module/megengine/logger.py b/python_module/megengine/logger.py deleted file mode 100644 index 5cca95bd..00000000 --- a/python_module/megengine/logger.py +++ /dev/null @@ -1,231 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import contextlib -import logging -import os -import sys - -_all_loggers = [] -_default_level_name = os.getenv("MEGENGINE_LOGGING_LEVEL", "INFO") -_default_level = logging.getLevelName(_default_level_name.upper()) - - -def set_log_file(fout, mode="a"): - r"""Sets log output file. - - :type fout: str or file-like - :param fout: file-like object that supports write and flush, or string for - the filename - :type mode: str - :param mode: specify the mode to open log file if *fout* is a string - """ - if isinstance(fout, str): - fout = open(fout, mode) - MegEngineLogFormatter.log_fout = fout - - -class MegEngineLogFormatter(logging.Formatter): - log_fout = None - date_full = "[%(asctime)s %(lineno)d@%(filename)s:%(name)s] " - date = "%(asctime)s " - msg = "%(message)s" - max_lines = 256 - - def _color_exc(self, msg): - r"""Sets the color of message as the execution type. - """ - return "\x1b[34m{}\x1b[0m".format(msg) - - def _color_dbg(self, msg): - r"""Sets the color of message as the debugging type. - """ - return "\x1b[36m{}\x1b[0m".format(msg) - - def _color_warn(self, msg): - r"""Sets the color of message as the warning type. - """ - return "\x1b[1;31m{}\x1b[0m".format(msg) - - def _color_err(self, msg): - r"""Sets the color of message as the error type. - """ - return "\x1b[1;4;31m{}\x1b[0m".format(msg) - - def _color_omitted(self, msg): - r"""Sets the color of message as the omitted type. - """ - return "\x1b[35m{}\x1b[0m".format(msg) - - def _color_normal(self, msg): - r"""Sets the color of message as the normal type. - """ - return msg - - def _color_date(self, msg): - r"""Sets the color of message the same as date. 
- """ - return "\x1b[32m{}\x1b[0m".format(msg) - - def format(self, record): - if record.levelno == logging.DEBUG: - mcl, mtxt = self._color_dbg, "DBG" - elif record.levelno == logging.WARNING: - mcl, mtxt = self._color_warn, "WRN" - elif record.levelno == logging.ERROR: - mcl, mtxt = self._color_err, "ERR" - else: - mcl, mtxt = self._color_normal, "" - - if mtxt: - mtxt += " " - - if self.log_fout: - self.__set_fmt(self.date_full + mtxt + self.msg) - formatted = super(MegEngineLogFormatter, self).format(record) - nr_line = formatted.count("\n") + 1 - if nr_line >= self.max_lines: - head, body = formatted.split("\n", 1) - formatted = "\n".join( - [ - head, - "BEGIN_LONG_LOG_{}_LINES{{".format(nr_line - 1), - body, - "}}END_LONG_LOG_{}_LINES".format(nr_line - 1), - ] - ) - self.log_fout.write(formatted) - self.log_fout.write("\n") - self.log_fout.flush() - - self.__set_fmt(self._color_date(self.date) + mcl(mtxt + self.msg)) - formatted = super(MegEngineLogFormatter, self).format(record) - - if record.exc_text or record.exc_info: - # handle exception format - b = formatted.find("Traceback ") - if b != -1: - s = formatted[b:] - s = self._color_exc(" " + s.replace("\n", "\n ")) - formatted = formatted[:b] + s - - nr_line = formatted.count("\n") + 1 - if nr_line >= self.max_lines: - lines = formatted.split("\n") - remain = self.max_lines // 2 - removed = len(lines) - remain * 2 - if removed > 0: - mid_msg = self._color_omitted( - "[{} log lines omitted (would be written to output file " - "if set_log_file() has been called;\n" - " the threshold can be set at " - "MegEngineLogFormatter.max_lines)]".format(removed) - ) - formatted = "\n".join(lines[:remain] + [mid_msg] + lines[-remain:]) - - return formatted - - if sys.version_info.major < 3: - - def __set_fmt(self, fmt): - self._fmt = fmt - - else: - - def __set_fmt(self, fmt): - self._style._fmt = fmt - - -def get_logger(name=None, formatter=MegEngineLogFormatter): - r"""Gets megengine logger with given name. - """ - - logger = logging.getLogger(name) - if getattr(logger, "_init_done__", None): - return logger - logger._init_done__ = True - logger.propagate = False - logger.setLevel(_default_level) - handler = logging.StreamHandler() - handler.setFormatter(formatter(datefmt="%d %H:%M:%S")) - handler.setLevel(0) - del logger.handlers[:] - logger.addHandler(handler) - _all_loggers.append(logger) - return logger - - -def set_log_level(level, update_existing=True): - """Sets default logging level. - - :type level: int e.g. logging.INFO - :param level: loggin level given by python :mod:`logging` module - :param update_existing: whether to update existing loggers - """ - global _default_level # pylint: disable=global-statement - _default_level = level - if update_existing: - for i in _all_loggers: - i.setLevel(level) - - -_logger = get_logger(__name__) - -try: - if sys.version_info.major < 3: - raise ImportError() - - from megengine._internal.logconf import set_logger as _set_mgb_logger - - class MegBrainLogFormatter(MegEngineLogFormatter): - date = "%(asctime)s[mgb] " - - def _color_date(self, msg): - return "\x1b[33m{}\x1b[0m".format(msg) - - _megbrain_logger = get_logger("megbrain", MegBrainLogFormatter) - _set_mgb_logger(_megbrain_logger) - - def set_mgb_log_level(level): - r"""Sets megbrain log level - - :type level: int e.g. 
logging.INFO - :param level: new log level - :return: original log level - """ - logger = _megbrain_logger - rst = logger.getEffectiveLevel() - logger.setLevel(level) - return rst - - -except ImportError as exc: - - def set_mgb_log_level(level): - raise NotImplementedError("megbrain has not been imported") - - -@contextlib.contextmanager -def replace_mgb_log_level(level): - r"""Replaces megbrain log level in a block and restore after exiting. - - :type level: int e.g. logging.INFO - :param level: new log level - """ - old = set_mgb_log_level(level) - try: - yield - finally: - set_mgb_log_level(old) - - -def enable_debug_log(): - r"""Sets logging level to debug for all components. - """ - set_log_level(logging.DEBUG) - set_mgb_log_level(logging.DEBUG) diff --git a/python_module/megengine/module/__init__.py b/python_module/megengine/module/__init__.py deleted file mode 100644 index c2b3db8a..00000000 --- a/python_module/megengine/module/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .activation import LeakyReLU, PReLU, ReLU, Sigmoid, Softmax -from .batchnorm import BatchNorm1d, BatchNorm2d, SyncBatchNorm -from .concat import Concat -from .conv import Conv2d, ConvRelu2d, ConvTranspose2d, LocalConv2d -from .conv_bn import ConvBn2d, ConvBnRelu2d -from .dropout import Dropout -from .elemwise import Elemwise -from .embedding import Embedding -from .identity import Identity -from .linear import Linear -from .module import Module -from .parampack import ParamPack -from .pooling import AvgPool2d, MaxPool2d -from .quant_dequant import DequantStub, QuantStub -from .sequential import Sequential diff --git a/python_module/megengine/module/activation.py b/python_module/megengine/module/activation.py deleted file mode 100644 index b80c10a7..00000000 --- a/python_module/megengine/module/activation.py +++ /dev/null @@ -1,231 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - -from ..core import Parameter -from ..functional import leaky_relu, prelu, relu, sigmoid, softmax -from .module import Module - - -class Softmax(Module): - r""" - Applies a softmax function. Softmax is defined as: - - .. math:: - \text{Softmax}(x_{i}) = \frac{exp(x_i)}{\sum_j exp(x_j)} - - It is applied to an n-dimensional input Tensor and rescaling them so that the elements of the - n-dimensional output Tensor lie in the range of `[0, 1]` and sum to 1. - - :param axis: An axis along which softmax will be applied. By default, - softmax will apply along the highest ranked axis. - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - - data = mge.tensor(np.array([-2,-1,0,1,2]).astype(np.float32)) - softmax = M.Softmax() - output = softmax(data) - with np.printoptions(precision=6): - print(output.numpy()) - - Outputs: - - .. 
testoutput:: - - [0.011656 0.031685 0.086129 0.234122 0.636409] - - """ - - def __init__(self, axis=None): - super().__init__() - self.axis = axis - - def forward(self, inputs): - return softmax(inputs, self.axis) - - -class Sigmoid(Module): - r""" - Applies the element-wise function: - - .. math:: - \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)} - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - - data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) - sigmoid = M.Sigmoid() - output = sigmoid(data) - with np.printoptions(precision=6): - print(output.numpy()) - - Outputs: - - .. testoutput:: - - [0.119203 0.268941 0.5 0.731059 0.880797] - - """ - - def forward(self, inputs): - return sigmoid(inputs) - - -class ReLU(Module): - r""" - Applies the element-wise function: - - .. math:: - \text{ReLU}(x) = \max(x, 0) - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) - relu = M.ReLU() - output = relu(data) - with np.printoptions(precision=6): - print(output.numpy()) - - Outputs: - - .. testoutput:: - - [0. 0. 0. 1. 2.] - - """ - - def forward(self, x): - return relu(x) - - -class PReLU(Module): - r""" - Applies the element-wise function: - - .. math:: - \text{PReLU}(x) = \max(0,x) + a * \min(0,x) - - or - - .. math:: - \text{PReLU}(x) = - \begin{cases} - x, & \text{ if } x \geq 0 \\ - ax, & \text{ otherwise } - \end{cases} - - Here :math:`a` is a learnable parameter. When called without arguments, `PReLU()` uses - a single parameter :math:`a` across all input channels. If called with `PReLU(num_of_channels)`, - a separate :math:`a` is used for each input channel. - - :param num_parameters: number of :math:`a` to learn; only two - values are legitimate: 1, or the number of channels of the input. Default: 1 - :param init: the initial value of :math:`a`. Default: 0.25 - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - data = mge.tensor(np.array([-1.2, -3.7, 2.7]).astype(np.float32)) - prelu = M.PReLU() - output = prelu(data) - print(output.numpy()) - - Outputs: - - .. testoutput:: - - [-0.3 -0.925 2.7 ] - - """ - - def __init__(self, num_parameters: int = 1, init: float = 0.25): - super().__init__() - self.num_parameters = num_parameters - if num_parameters > 1: - # Assume format is NCHW - self.weight = Parameter( - value=np.full((1, num_parameters, 1, 1), init, dtype=np.float32) - ) - else: - self.weight = Parameter(value=[init]) - - def forward(self, inputs): - assert self.weight.shape == (1,) or self.weight.shape == ( - 1, - int(inputs.shape[1]), - 1, - 1, - ), "invalid weight's shape" - return prelu(inputs, self.weight) - - -class LeakyReLU(Module): - r""" - Applies the element-wise function: - - .. math:: - \text{LeakyReLU}(x) = \max(0,x) + negative\_slope \times \min(0,x) - - or - - .. math:: - \text{LeakyReLU}(x) = - \begin{cases} - x, & \text{ if } x \geq 0 \\ - negative\_slope \times x, & \text{ otherwise } - \end{cases} - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - data = mge.tensor(np.array([-8, -12, 6, 10]).astype(np.float32)) - - leakyrelu = M.LeakyReLU(0.01) - output = leakyrelu(data) - print(output.numpy()) - - Outputs: - - .. testoutput:: - - [-0.08 -0.12 6. 10.
] - - """ - - def __init__(self, negative_slope: float = 0.01): - super().__init__() - self.negative_slope = negative_slope - - def forward(self, inputs): - return leaky_relu(inputs, self.negative_slope) diff --git a/python_module/megengine/module/batchnorm.py b/python_module/megengine/module/batchnorm.py deleted file mode 100644 index ba755616..00000000 --- a/python_module/megengine/module/batchnorm.py +++ /dev/null @@ -1,257 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - -from ..core import Buffer, Parameter -from ..core.device import get_default_device -from ..functional import batch_norm2d, sync_batch_norm -from . import init -from .module import Module - - -class _BatchNorm(Module): - def __init__( - self, - num_features, - eps=1e-5, - momentum=0.9, - affine=True, - track_running_stats=True, - ): - super(_BatchNorm, self).__init__() - self.num_features = num_features - self.eps = eps - self.momentum = momentum - self.affine = affine - self.track_running_stats = track_running_stats - if self.affine: - self.weight = Parameter(np.ones(num_features, dtype=np.float32)) - self.bias = Parameter(np.zeros(num_features, dtype=np.float32)) - else: - self.weight = None - self.bias = None - - tshape = (1, self.num_features, 1, 1) - - if self.track_running_stats: - self.running_mean = Buffer(np.zeros(tshape, dtype=np.float32)) - self.running_var = Buffer(np.ones(tshape, dtype=np.float32)) - else: - self.running_mean = None - self.running_var = None - - def reset_running_stats(self) -> None: - if self.track_running_stats: - init.zeros_(self.running_mean) - init.ones_(self.running_var) - - def reset_parameters(self) -> None: - self.reset_running_stats() - if self.affine: - init.ones_(self.weight) - init.zeros_(self.bias) - - def _check_input_ndim(self, inp): - raise NotImplementedError - - def forward(self, inp): - self._check_input_ndim(inp) - - _ndims = len(inp.shape) - if _ndims != 4: - origin_shape = inp.shapeof() - if _ndims == 2: - n, c = inp.shapeof(0), inp.shapeof(1) - new_shape = (n, c, 1, 1) - elif _ndims == 3: - n, c, h = inp.shapeof(0), inp.shapeof(1), inp.shapeof(2) - new_shape = (n, c, h, 1) - - inp = inp.reshape(new_shape) - - if self.training and self.track_running_stats: - exponential_average_factor = self.momentum - else: - exponential_average_factor = 0.0 # useless - - # FIXME currently rocm does not support real bn opr so we just use - # sync_batch_norm(as implemented by elemwise) here, - # we will fix it in the next version - if get_default_device() == "rocmx": - output = sync_batch_norm( - inp, - self.running_mean, - self.running_var, - self.weight, - self.bias, - self.training or not self.track_running_stats, - exponential_average_factor, - self.eps, - ) - else: - output = batch_norm2d( - inp, - self.running_mean, - self.running_var, - self.weight, - self.bias, - self.training or not self.track_running_stats, - exponential_average_factor, - self.eps, - ) - - if _ndims != 4: - output = output.reshape(origin_shape) - - return output - - -class SyncBatchNorm(_BatchNorm): - r""" - Applies Synchronization Batch Normalization. 
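The shape handling in `_BatchNorm.forward` above is worth spelling out: 2D `(N, C)` and 3D `(N, C, H)` inputs get trailing singleton dimensions so a single 4D batch-norm kernel serves all cases, and the original shape is restored on the way out. A standalone sketch of just that logic:

import numpy as np

def to_4d(shape):
    # (N, C) -> (N, C, 1, 1); (N, C, H) -> (N, C, H, 1); 4D passes through.
    if len(shape) == 2:
        return shape + (1, 1)
    if len(shape) == 3:
        return shape + (1,)
    return shape

x = np.zeros((8, 16, 24))  # a 3D (N, C, H) input
print(to_4d(x.shape))      # (8, 16, 24, 1); reshaped back to x.shape afterwards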
- """ - - def _check_input_ndim(self, inp): - if len(inp.shape) not in {2, 3, 4}: - raise ValueError( - "expected 2D, 3D or 4D input (got {}D input)".format(len(inp.shape)) - ) - - def forward(self, inp): - self._check_input_ndim(inp) - - _ndims = len(inp.shape) - if _ndims != 4: - origin_shape = inp.shapeof() - if _ndims == 2: - n, c = inp.shapeof(0), inp.shapeof(1) - new_shape = (n, c, 1, 1) - elif _ndims == 3: - n, c, h = inp.shapeof(0), inp.shapeof(1), inp.shapeof(2) - new_shape = (n, c, h, 1) - - inp = inp.reshape(new_shape) - - if self.training and self.track_running_stats: - exponential_average_factor = self.momentum - else: - exponential_average_factor = 0.0 # useless - - output = sync_batch_norm( - inp, - self.running_mean, - self.running_var, - self.weight, - self.bias, - self.training or not self.track_running_stats, - exponential_average_factor, - self.eps, - ) - - if _ndims != 4: - output = output.reshape(origin_shape) - - return output - - -class BatchNorm1d(_BatchNorm): - r""" - Applies Batch Normalization over a 2D/3D tensor. - - Refer to :class:`~.BatchNorm2d` for more information. - """ - - def _check_input_ndim(self, inp): - if len(inp.shape) not in {2, 3}: - raise ValueError( - "expected 2D or 3D input (got {}D input)".format(len(inp.shape)) - ) - - -class BatchNorm2d(_BatchNorm): - r""" - Applies Batch Normalization over a 4D tensor. - - .. math:: - - y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta - - The mean and standard-deviation are calculated per-dimension over - the mini-batches and :math:`\gamma` and :math:`\beta` are learnable - parameter vectors. - - By default, during training this layer keeps running estimates of its - computed mean and variance, which are then used for normalization during - evaluation. The running estimates are kept with a default :attr:`momentum` - of 0.9. - - If :attr:`track_running_stats` is set to ``False``, this layer will not - keep running estimates, and batch statistics are instead used during - evaluation time. - - .. note:: - This :attr:`momentum` argument is different from one used in optimizer - classes and the conventional notion of momentum. Mathematically, the - update rule for running statistics here is - :math:`\hat{x}_\text{new} = \text{momentum} \times \hat{x} + (1 - \text{momentum}) \times x_t`, - where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the - new observed value. - - Because the Batch Normalization is done over the `C` dimension, computing - statistics on `(N, H, W)` slices, it's common terminology to call this - Spatial Batch Normalization. - - :type num_features: int - :param num_features: usually the :math:`C` from an input of size - :math:`(N, C, H, W)` or the highest ranked dimension of an input with - less than 4D. - :type eps: float - :param eps: a value added to the denominator for numerical stability. - Default: 1e-5. - :type momentum: float - :param momentum: the value used for the `running_mean` and `running_var` - computation. - Default: 0.9 - :type affine: bool - :param affine: a boolean value that when set to ``True``, this module has - learnable affine parameters. Default: ``True`` - :type track_running_stats: bool - :param track_running_stats: when set to ``True``, this module tracks the - running mean and variance. When set to ``False``, this module does not - track such statistics and always uses batch statistics in both training - and eval modes. Default: ``True``. - - - Examples: - - .. 
testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - - # With Learnable Parameters - m = M.BatchNorm2d(4) - inp = mge.tensor(np.random.rand(1, 4, 3, 3).astype("float32")) - oup = m(inp) - print(m.weight, m.bias) - # Without Learnable Parameters - m = M.BatchNorm2d(4, affine=False) - oup = m(inp) - print(m.weight, m.bias) - - .. testoutput:: - - Tensor([1. 1. 1. 1.]) Tensor([0. 0. 0. 0.]) - None None - """ - - def _check_input_ndim(self, inp): - if len(inp.shape) != 4: - raise ValueError("expected 4D input (got {}D input)".format(len(inp.shape))) diff --git a/python_module/megengine/module/concat.py b/python_module/megengine/module/concat.py deleted file mode 100644 index 453f951b..00000000 --- a/python_module/megengine/module/concat.py +++ /dev/null @@ -1,22 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Iterable - -from .. import functional as F -from ..core.tensor import Tensor -from .module import Module - - -class Concat(Module): - r""" - A :class:`~.Module` to do functional concat. Could be replaced with :class:`~.QATModule` - version :class:`~.qat.concat.Concat` using :func:`~.quantize.quantize_qat`. - """ - - def forward(self, inps: Iterable[Tensor], axis: int = 0): - return F.concat(inps, axis) diff --git a/python_module/megengine/module/conv.py b/python_module/megengine/module/conv.py deleted file mode 100644 index 02165b89..00000000 --- a/python_module/megengine/module/conv.py +++ /dev/null @@ -1,392 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from abc import abstractmethod -from typing import Tuple, Union - -import numpy as np - -import megengine._internal as mgb - -from .. import functional as F -from ..core import Parameter -from ..utils.types import _pair, _pair_nonzero -from . 
import init -from .module import Module - - -class _ConvNd(Module): - """base class for convolution modules, including transposed conv""" - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]], - padding: Union[int, Tuple[int, int]], - dilation: Union[int, Tuple[int, int]], - groups: int, - bias: bool = True, - ): - super().__init__() - if in_channels % groups != 0: - raise ValueError("in_channels must be divisible by groups") - if out_channels % groups != 0: - raise ValueError("out_channels must be divisible by groups") - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - self.dilation = dilation - self.groups = groups - - self.weight = Parameter(np.zeros(self._infer_weight_shape(), dtype=np.float32)) - self.bias = None - if bias: - self.bias = Parameter(np.zeros(self._infer_bias_shape(), dtype=np.float32)) - self.reset_parameters() - - @abstractmethod - def _get_fanin(self): - pass - - def reset_parameters(self) -> None: - fanin = self._get_fanin() - std = np.sqrt(1 / fanin) - init.normal_(self.weight, 0.0, std) - if self.bias is not None: - init.zeros_(self.bias) - - @abstractmethod - def _infer_weight_shape(self): - pass - - @abstractmethod - def _infer_bias_shape(self): - pass - - -class Conv2d(_ConvNd): - r"""Applies a 2D convolution over an input tensor. - - For instance, given an input of the size :math:`(N, C_{\text{in}}, H, W)`, - this layer generates an output of the size - :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})` through the - process described as below: - - .. math:: - \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) + - \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k) - - where :math:`\star` is the valid 2D cross-correlation operator, - :math:`N` is a batch size, :math:`C` denotes a number of channels, - :math:`H` is a height of input planes in pixels, and :math:`W` is - width in pixels. - - When ``groups == in_channels`` and ``out_channels == K * in_channels``, - where `K` is a positive integer, this operation is also known as depthwise - convolution. - - In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`, - a depthwise convolution with a depthwise multiplier `K`, can be constructed - by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. - - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is - an :class:`int`, the actual kernel size would be - ``(kernel_size, kernel_size)``. Default: 1 - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and there would be an extra dimension at the beginning of the weight's - shape. Specifically, the shape of weight would be ``(groups, - out_channel // groups, in_channels // groups, *kernel_size)``. 
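The weight-shape rule just described is easy to verify with a few lines of standalone arithmetic (`conv2d_weight_shape` is a hypothetical helper written for illustration, mirroring the rule in the docstring):

def conv2d_weight_shape(in_channels, out_channels, kernel_size, groups=1):
    # Dense conv uses (OC, IC, KH, KW); grouped conv prepends the group dim.
    kh, kw = kernel_size
    if groups == 1:
        return (out_channels, in_channels, kh, kw)
    assert in_channels % groups == 0 and out_channels % groups == 0
    return (groups, out_channels // groups, in_channels // groups, kh, kw)

print(conv2d_weight_shape(16, 32, (3, 3)))             # (32, 16, 3, 3)
print(conv2d_weight_shape(16, 32, (3, 3), groups=4))   # (4, 8, 4, 3, 3)
# Depthwise case: groups == in_channels and out_channels == K * in_channels.
print(conv2d_weight_shape(16, 16, (3, 3), groups=16))  # (16, 1, 1, 3, 3)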
- :param bias: whether to add a bias onto the result of convolution. Default: - True - :param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default: - `CROSS_CORRELATION`. - :param compute_mode: When set to `DEFAULT`, no special requirements will be - placed on the precision of intermediate results. When set to `FLOAT32`, - float32 would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - """ - - _conv_mode_type = mgb.opr_param_defs.Convolution.Mode - _compute_mode_type = mgb.opr_param_defs.Convolution.ComputeMode - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - bias: bool = True, - conv_mode: str = "CROSS_CORRELATION", - compute_mode: str = "DEFAULT", - ): - kernel_size = _pair_nonzero(kernel_size) - stride = _pair_nonzero(stride) - padding = _pair(padding) - dilation = _pair_nonzero(dilation) - self.conv_mode = self._conv_mode_type.convert(conv_mode) - self.compute_mode = self._compute_mode_type.convert(compute_mode) - super().__init__( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - ) - - def _get_fanin(self): - kh, kw = self.kernel_size - ic = self.in_channels - return kh * kw * ic - - def _infer_weight_shape(self): - group = self.groups - ichl = self.in_channels - ochl = self.out_channels - kh, kw = self.kernel_size - if group == 1: - # Assume format is NCHW - return (ochl, ichl, kh, kw) - - assert ( - ichl % group == 0 and ochl % group == 0 - ), "invalid config: input_channels={} output_channels={} group={}".format( - ichl, ochl, group - ) - # Assume format is NCHW - return (group, ochl // group, ichl // group, kh, kw) - - def _infer_bias_shape(self): - # Assume format is NCHW - return (1, self.out_channels, 1, 1) - - def calc_conv(self, inp, weight, bias): - return F.conv2d( - inp, - weight, - bias, - self.stride, - self.padding, - self.dilation, - self.groups, - self.conv_mode, - self.compute_mode, - ) - - def forward(self, inp): - return self.calc_conv(inp, self.weight, self.bias) - - -class ConvTranspose2d(_ConvNd): - r"""Applies a 2D transposed convolution over an input tensor. - - This module is also known as a deconvolution or a fractionally-strided convolution. - :class:`ConvTranspose2d` can ben seen as the gradient of :class:`Conv2d` operation - with respect to its input. - - Convolution usually reduces the size of input, while transposed convolution works - the opposite way, transforming a smaller input to a larger output while preserving the - connectivity pattern. - - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is - an :class:`int`, the actual kernel size would be - ``(kernel_size, kernel_size)``. Default: 1 - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". 
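Because transposed convolution inverts the spatial mapping of :class:`Conv2d`, its output size runs the usual convolution formula backwards. A hedged worked example (these are the standard (transposed-)convolution size formulas, stated for illustration rather than taken from this module):

def conv_out(h, k, s=1, p=0, d=1):
    # standard forward-convolution output size
    return (h + 2 * p - d * (k - 1) - 1) // s + 1

def conv_transpose_out(h, k, s=1, p=0, d=1):
    # transposed convolution inverts the relation above
    return (h - 1) * s - 2 * p + d * (k - 1) + 1

print(conv_out(32, k=3, s=2, p=1))            # 16
print(conv_transpose_out(16, k=3, s=2, p=1))  # 31: inverse up to stride rounding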
When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and there would be an extra dimension at the beginning of the weight's - shape. Specifically, the shape of weight would be ``(groups, - out_channels // groups, in_channels // groups, *kernel_size)``. Default: 1 - :param bias: wether to add a bias onto the result of convolution. Default: - True - :param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default: - `CROSS_CORRELATION`. - :param compute_mode: When set to `DEFAULT`, no special requirements will be - placed on the precision of intermediate results. When set to `FLOAT32`, - float32 would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - """ - - _conv_mode_type = mgb.opr_param_defs.Convolution.Mode - _compute_mode_type = mgb.opr_param_defs.Convolution.ComputeMode - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - bias: bool = True, - conv_mode: str = "CROSS_CORRELATION", - compute_mode: str = "DEFAULT", - ): - kernel_size = _pair_nonzero(kernel_size) - stride = _pair_nonzero(stride) - padding = _pair(padding) - dilation = _pair_nonzero(dilation) - self.conv_mode = self._conv_mode_type.convert(conv_mode) - self.compute_mode = self._compute_mode_type.convert(compute_mode) - super().__init__( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - ) - - def _get_fanin(self): - kh, kw = self.kernel_size - oc = self.out_channels - return kh * kw * oc - - def _infer_weight_shape(self): - group = self.groups - ichl = self.in_channels - ochl = self.out_channels - kh, kw = self.kernel_size - if group == 1: - # Assume format is NCHW - return (ichl, ochl, kh, kw) - - assert ( - ichl % group == 0 and ochl % group == 0 - ), "invalid config: input_channels={} output_channels={} group={}".format( - ichl, ochl, group - ) - # Assume format is NCHW - return (group, ichl // group, ochl // group, kh, kw) - - def _infer_bias_shape(self): - # Assume format is NCHW - return (1, self.out_channels, 1, 1) - - def forward(self, inp): - return F.conv_transpose2d( - inp, - self.weight, - self.bias, - self.stride, - self.padding, - self.dilation, - self.groups, - self.conv_mode, - self.compute_mode, - ) - - -class LocalConv2d(Conv2d): - r"""Applies a spatial convolution with untied kernels over an input 4D tensor. - It is also known as the locally connected layer. - - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param input_height: the height of the input images. - :param input_width: the width of the input images. - :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is - an :class:`int`, the actual kernel size would be - ``(kernel_size, kernel_size)``. Default: 1 - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``. 
- The shape of weight is ``(groups, output_height, output_width, - in_channels // groups, *kernel_size, out_channels // groups)``. - """ - - _conv_mode_type = mgb.opr_param_defs.Convolution.Mode - - def __init__( - self, - in_channels: int, - out_channels: int, - input_height: int, - input_width: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - conv_mode: str = "CROSS_CORRELATION", - ): - self.input_height = input_height - self.input_width = input_width - super().__init__( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias=False, - ) - - def _infer_weight_shape(self): - group = self.groups - output_height = ( - self.input_height + self.padding[0] * 2 - self.kernel_size[0] - ) // self.stride[0] + 1 - output_width = ( - self.input_width + self.padding[1] * 2 - self.kernel_size[1] - ) // self.stride[1] + 1 - # Assume format is NCHW - return ( - group, - output_height, - output_width, - self.in_channels // group, - self.kernel_size[0], - self.kernel_size[1], - self.out_channels // group, - ) - - def forward(self, inp): - return F.local_conv2d( - inp, self.weight, self.stride, self.padding, self.dilation, self.conv_mode - ) - - -class ConvRelu2d(Conv2d): - r""" - A fused :class:`~.Module` including Conv2d and relu. Could be replaced - with :class:`~.QATModule` version :class:`~.qat.conv.ConvRelu2d` using - :func:`~.quantize.quantize_qat`. - """ - - def forward(self, inp): - return F.relu(self.calc_conv(inp, self.weight, self.bias)) diff --git a/python_module/megengine/module/conv_bn.py b/python_module/megengine/module/conv_bn.py deleted file mode 100644 index 76713b0f..00000000 --- a/python_module/megengine/module/conv_bn.py +++ /dev/null @@ -1,69 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Tuple, Union - -from ..functional import relu -from .batchnorm import BatchNorm2d -from .conv import Conv2d -from .module import Module - - -class _ConvBnActivation2d(Module): - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - bias: bool = True, - conv_mode: str = "CROSS_CORRELATION", - compute_mode: str = "DEFAULT", - eps=1e-5, - momentum=0.9, - affine=True, - track_running_stats=True, - ): - super().__init__() - self.conv = Conv2d( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - conv_mode, - compute_mode, - ) - self.bn = BatchNorm2d(out_channels, eps, momentum, affine, track_running_stats) - - -class ConvBn2d(_ConvBnActivation2d): - r""" - A fused :class:`~.Module` including Conv2d, BatchNorm2d. Could be replaced - with :class:`~.QATModule` version :class:`~.qat.conv_bn.ConvBn2d` using - :func:`~.quantize.quantize_qat`. - """ - - def forward(self, inp): - return self.bn(self.conv(inp)) - - -class ConvBnRelu2d(_ConvBnActivation2d): - r""" - A fused :class:`~.Module` including Conv2d, BatchNorm2d and relu. 
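Looking back at `LocalConv2d._infer_weight_shape` above: untied kernels mean one distinct filter per output location, so the output spatial size is folded into the weight shape. A standalone sketch of that computation (`local_conv2d_weight_shape` is a hypothetical helper for illustration):

def local_conv2d_weight_shape(ic, oc, ih, iw, kernel, stride=1, pad=0, groups=1):
    kh, kw = kernel
    oh = (ih + 2 * pad - kh) // stride + 1  # same formula as _infer_weight_shape
    ow = (iw + 2 * pad - kw) // stride + 1
    return (groups, oh, ow, ic // groups, kh, kw, oc // groups)

print(local_conv2d_weight_shape(8, 16, 32, 32, (3, 3), stride=1, pad=1))
# (1, 32, 32, 8, 3, 3, 16): a distinct 8x3x3 -> 16 filter per output pixel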
Could be replaced - with :class:`~.QATModule` version :class:`~.qat.conv_bn.ConvBnRelu2d` using - :func:`~.quantize.quantize_qat`. - """ - - def forward(self, inp): - return relu(self.bn(self.conv(inp))) diff --git a/python_module/megengine/module/dropout.py b/python_module/megengine/module/dropout.py deleted file mode 100644 index 146eba24..00000000 --- a/python_module/megengine/module/dropout.py +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from ..functional import dropout -from .module import Module - - -class Dropout(Module): - r"""Randomly set input elements to zeros with the probability :math:`drop\_prob` during training. Commonly used in large networks to prevent overfitting. - Note that we perform dropout only during training, we also rescale(multiply) the output tensor - by :math:`\frac{1}{1 - drop\_prob}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`. - - :param drop_prob: The probability to drop (set to zero) each single element - """ - - def __init__(self, drop_prob=0.0): - super().__init__() - self.drop_prob = drop_prob - - def forward(self, inputs): - if self.training: - return dropout(inputs, self.drop_prob, rescale=True) - else: - return inputs diff --git a/python_module/megengine/module/elemwise.py b/python_module/megengine/module/elemwise.py deleted file mode 100644 index d1947b5e..00000000 --- a/python_module/megengine/module/elemwise.py +++ /dev/null @@ -1,90 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .. import _internal as mgb -from ..core import Tensor, wrap_io_tensor -from ..core.graph import _use_default_if_none -from .module import Module - - -@wrap_io_tensor -def _elemwise_func(mode, *inputs, **kwargs) -> Tensor: - if all(isinstance(i, (int, float)) for i in inputs): - device, comp_graph = _use_default_if_none(None, None) - ret = mgb.opr.elemwise( - *inputs, mode=mode, comp_node=device, comp_graph=comp_graph, **kwargs - ) - return ret.inferred_value[0] - return mgb.opr.elemwise(*inputs, mode=mode, **kwargs) - - -class Elemwise(Module): - r""" - A :class:`~.Module` to do elemwise operator. Could be replaced with :class:`~.QATModule` - version :class:`~.qat.elemwise.Elemwise` using :func:`~.quantize.quantize_qat`. - - :param method: the elemwise method, support the following string. - It will do the normal elemwise operator for float. - - * "ADD": a + b - * "FUSE_ADD_RELU": max(x+y, 0) - * "MUL": x * y - * "MIN": min(x, y) - * "MAX": max(x, y) - * "SUB": x - y - * "TRUE_DIV": x / y - * "FUSE_ADD_SIGMOID": sigmoid(x + y) - * "FUSE_ADD_TANH": tanh(x + y) - * "RELU": x > 0 ? x : 0 - * "ABS": x > 0 ? 
x : -x - * "SIGMOID": sigmoid(x) - * "EXP": exp(x) - * "TANH": tanh(x) - * "FUSE_MUL_ADD3": x * y + z - * "FAST_TANH": fast_tanh(x) - * "NEGATE": -x - * "ACOS": acos(x) - * "ASIN": asin(x) - * "CEIL": ceil(x) - * "COS": cos(x) - * "EXPM1": expm1(x) - * "FLOOR": floor(x) - * "LOG": log(x) - * "LOG1P": log1p(x) - * "SIN": sin(x) - * "ROUND": round(x) - * "ERF": erf(x) - * "ERFINV": erfinv(x) - * "ERFC": erfc(x) - * "ERFCINV": erfcinv(x) - * "ABS_GRAD": abs_grad - * "FLOOR_DIV": floor_div - * "MOD": mod - * "SIGMOID_GRAD": sigmoid_grad - * "SWITCH_GT0": switch_gt0 - * "TANH_GRAD": tanh_grad - * "LT": lt - * "LEQ": leq - * "EQ": eq - * "POW": pow - * "LOG_SUM_EXP": log_sum_exp - * "FAST_TANH_GRAD": fast_tanh_grad - * "ATAN2": atan2 - * "COND_LEQ_MOV": cond_leq_mov - * "H_SWISH": h_swish - * "FUSE_ADD_H_SWISH": h_swish(x+y) - * "H_SWISH_GRAD": h_swish_grad - """ - - _elemwise_mode_type = mgb.opr_param_defs.Elemwise.Mode - - def __init__(self, method): - super().__init__() - self.method = self._elemwise_mode_type.convert(method) - - def forward(self, *inps): - return _elemwise_func(self.method, *inps) diff --git a/python_module/megengine/module/embedding.py b/python_module/megengine/module/embedding.py deleted file mode 100644 index 976ac125..00000000 --- a/python_module/megengine/module/embedding.py +++ /dev/null @@ -1,171 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Optional - -import numpy as np - -from ..core import Parameter -from ..functional import embedding as embedding_func -from . import init -from .module import Module - - -class Embedding(Module): - r""" - A simple lookup table that stores embeddings of a fixed dictionary and size. - - This module is often used to store word embeddings and retrieve them using indices. - The input to the module is a list of indices, and the output is the corresponding word embeddings. - The indices should less than num_embeddings. - - :param num_embeddings: size of embedding dictionary. - :param embedding_dim: size of each embedding vector. - :param padding_idx: should be set to None, not support now. - :param max_norm: should be set to None, not support now. - :param norm_type: should be set to None, not support now. - :param initial_weight: the learnable weights of the module of shape (num_embeddings, embedding_dim). - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6),(0.1,1.1,2.1,3.1,4.1)], dtype=np.float32)) - data = mge.tensor(np.array([(0,1,1),(1,0,1),(0,0,1)], dtype=np.int32)) - - embedding = M.Embedding(2, 5, initial_weight=weight) - output = embedding(data) - with np.printoptions(precision=6): - print(output.numpy()) - - Outputs: - - .. 
testoutput:: - - [[[1.2 2.3 3.4 4.5 5.6] - [0.1 1.1 2.1 3.1 4.1] - [0.1 1.1 2.1 3.1 4.1]] - - [[0.1 1.1 2.1 3.1 4.1] - [1.2 2.3 3.4 4.5 5.6] - [0.1 1.1 2.1 3.1 4.1]] - - [[1.2 2.3 3.4 4.5 5.6] - [1.2 2.3 3.4 4.5 5.6] - [0.1 1.1 2.1 3.1 4.1]]] - - """ - - def __init__( - self, - num_embeddings: int, - embedding_dim: int, - padding_idx: Optional[int] = None, - max_norm: Optional[float] = None, - norm_type: Optional[float] = None, - initial_weight: Parameter = None, - ): - super().__init__() - if padding_idx is not None: - raise ValueError("Not support padding index now.") - if max_norm is not None or norm_type is not None: - raise ValueError("Not support weight normalize now.") - self.padding_idx = padding_idx - self.max_norm = max_norm - self.norm_type = norm_type - self.num_embeddings = num_embeddings - self.embedding_dim = embedding_dim - if initial_weight is None: - self.weight = Parameter( - np.random.uniform( - size=(self.num_embeddings, self.embedding_dim) - ).astype(np.float32) - ) - self.reset_parameters() - else: - if initial_weight.shape != (num_embeddings, embedding_dim): - raise ValueError( - "The weight shape should match num_embeddings and embedding_dim" - ) - self.weight = Parameter(initial_weight.numpy()) - - def reset_parameters(self) -> None: - init.normal_(self.weight) - - def forward(self, inputs): - return embedding_func(inputs, self.weight) - - @classmethod - def from_pretrained( - cls, - embeddings: Parameter, - freeze: Optional[bool] = True, - padding_idx: Optional[int] = None, - max_norm: Optional[float] = None, - norm_type: Optional[float] = None, - ): - r""" - Creates Embedding instance from given 2-dimensional FloatTensor. - - :param embeddings: Tensor contained weight for the embedding. - :param freeze: If ``True``, the weight does not get updated during the learning process. Default: ``True``. - :param padding_idx: should be set to None, not support Now. - :param max_norm: should be set to None, not support Now. - :param norm_type: should be set to None, not support Now. - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6),(0.1,1.1,2.1,3.1,4.1)], dtype=np.float32)) - data = mge.tensor(np.array([(0,1,1),(1,0,1),(0,0,1)], dtype=np.int32)) - - embedding = M.Embedding.from_pretrained(weight, freeze=False) - output = embedding(data) - print(output.numpy()) - - Outputs: - - .. testoutput:: - - [[[1.2 2.3 3.4 4.5 5.6] - [0.1 1.1 2.1 3.1 4.1] - [0.1 1.1 2.1 3.1 4.1]] - - [[0.1 1.1 2.1 3.1 4.1] - [1.2 2.3 3.4 4.5 5.6] - [0.1 1.1 2.1 3.1 4.1]] - - [[1.2 2.3 3.4 4.5 5.6] - [1.2 2.3 3.4 4.5 5.6] - [0.1 1.1 2.1 3.1 4.1]]] - - - """ - embeddings_shape = embeddings.shape - embeddings_dim = len(embeddings_shape) - if embeddings_dim != 2: - raise ValueError("Embeddings parameter is expected to be 2-dimensional") - rows = embeddings_shape[0] - cols = embeddings_shape[1] - embedding = cls( - num_embeddings=rows, - embedding_dim=cols, - initial_weight=embeddings, - padding_idx=padding_idx, - max_norm=max_norm, - norm_type=norm_type, - ) - embedding.weight.requires_grad = not freeze - return embedding diff --git a/python_module/megengine/module/external.py b/python_module/megengine/module/external.py deleted file mode 100644 index 962754e8..00000000 --- a/python_module/megengine/module/external.py +++ /dev/null @@ -1,83 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - -from ..functional.external import ( - atlas_subgraph, - cambricon_subgraph, - extern_opr_subgraph, -) -from .module import Module - - -class CambriconSubgraph(Module): - r"""Load a serialized Cambricon subgraph. - - See :func:`~.cambricon_subgraph` for more details. - """ - - def __init__( - self, data, symbol, tensor_dim_mutable, - ): - super(CambriconSubgraph, self).__init__() - self._data = data - self.symbol = symbol - self.tensor_dim_mutable = tensor_dim_mutable - - @property - def data(self): - return self._data.tobytes() - - @data.setter - def data(self, val): - self._data = np.frombuffer(val, dtype=np.uint8) - - def forward(self, inputs): - outputs = cambricon_subgraph( - inputs, self._data, self.symbol, self.tensor_dim_mutable, - ) - return outputs - - -class AtlasSubgraph(Module): - r"""Load a serialized Atlas subgraph. - - See :func:`~.atlas_subgraph` for more details. - """ - - def __init__(self, data): - super(AtlasSubgraph, self).__init__() - self._data = data - - @property - def data(self): - return self._data.tobytes() - - @data.setter - def data(self, val): - self._data = np.frombuffer(val, dtype=np.uint8) - - def forward(self, inputs): - outputs = atlas_subgraph(inputs, self._data) - return outputs - - -class ExternOprSubgraph(Module): - r"""Load a serialized extern opr subgraph. - """ - - def __init__(self, data, name, output_shapes): - super(ExternOprSubgraph, self).__init__() - self.data = data - self.name = name - self.output_shapes = output_shapes - - def forward(self, inputs): - outputs = extern_opr_subgraph(inputs, self.output_shapes, self.name, self.data,) - return outputs diff --git a/python_module/megengine/module/identity.py b/python_module/megengine/module/identity.py deleted file mode 100644 index 51b31e50..00000000 --- a/python_module/megengine/module/identity.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from ..functional import identity -from .module import Module - - -class Identity(Module): - r"""A placeholder identity operator that will ignore any argument.""" - - def forward(self, x): - return identity(x) diff --git a/python_module/megengine/module/init.py b/python_module/megengine/module/init.py deleted file mode 100644 index 8c39443e..00000000 --- a/python_module/megengine/module/init.py +++ /dev/null @@ -1,264 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
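One detail from `external.py` above deserves a note: the `data` property pair stores the serialized subgraph as a `uint8` ndarray but exposes it as `bytes`, and the two representations round-trip losslessly. A standalone sketch:

import numpy as np

# Mirrors the data getter/setter in CambriconSubgraph and AtlasSubgraph.
payload = b"\x00serialized-subgraph\xff"
stored = np.frombuffer(payload, dtype=np.uint8)  # setter: bytes -> ndarray view
assert stored.tobytes() == payload               # getter: lossless round-trip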
-import math -from functools import reduce -from typing import Optional, Tuple, Union - -import numpy as np - -from ..core import Graph, Tensor -from ..random import gaussian, uniform - - -def fill_(tensor: Tensor, val: Union[float, int]) -> None: - """Fill the given ``tensor`` with value ``val``. - - :param tensor: An n-dimensional tensor to be initialized - :param val: The value to be filled throughout the tensor - """ - tensor.set_value(np.full(tensor.shape, val, tensor.dtype)) - - -def zeros_(tensor: Tensor) -> None: - """Fill the given ``tensor`` with scalar value `0`. - - :param tensor: An n-dimensional tensor to be initialized - """ - fill_(tensor, 0) - - -def ones_(tensor: Tensor) -> None: - """Fill the given ``tensor`` with the scalar value `1`. - - :param tensor: An n-dimensional tensor to be initialized - """ - fill_(tensor, 1) - - -def uniform_(tensor: Tensor, a: float = 0.0, b: float = 1.0) -> None: - r"""Fill the given ``tensor`` with random values sampled from the uniform distribution - :math:`\mathcal{U}(\text{a}, \text{b})`. - - :param tensor: An n-dimensional tensor to be initialized - :param a: Lower bound of the sampling interval - :param b: Upper bound of the sampling interval - """ - with Graph(eager_evaluation=True): - tensor.set_value((b - a) * uniform(tensor.shape) + a) - - -def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: - r"""Fill the given ``tensor`` with random values sampled from the normal distribution - :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. - - :param tensor: An n-dimensional tensor to be initialized - :param mean: The mean of the normal distribution - :param std: The standard deviation of the normal distribution - """ - with Graph(eager_evaluation=True): - tensor.set_value(gaussian(tensor.shape, mean=mean, std=std)) - - -def calculate_gain( - nonlinearity: str, param: Optional[Union[int, float]] = None -) -> float: - r"""Return a recommended gain value (see the table below) for the given nonlinearity - function. - - ================= ==================================================== - nonlinearity gain - ================= ==================================================== - Linear / Identity :math:`1` - Conv{1,2,3}D :math:`1` - Sigmoid :math:`1` - Tanh :math:`\frac{5}{3}` - ReLU :math:`\sqrt{2}` - Leaky ReLU :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}` - ================= ==================================================== - - :param nonlinearity: Name of the non-linear function - :param param: Optional parameter for leaky_relu. Only effective when - ``nonlinearity`` is "leaky_relu". - - """ - linear_fns = [ - "linear", - "conv1d", - "conv2d", - "conv3d", - "conv_transpose1d", - "conv_transpose2d", - "conv_transpose3d", - ] - if nonlinearity in linear_fns or nonlinearity == "sigmoid": - return 1 - if nonlinearity == "tanh": - return 5.0 / 3 - if nonlinearity == "relu": - return math.sqrt(2.0) - if nonlinearity == "leaky_relu": - if param is None: - negative_slope = 0.01 - elif ( - not isinstance(param, bool) - and isinstance(param, int) - or isinstance(param, float) - ): - # True/False are instances of int, hence check above - negative_slope = param - else: - raise ValueError("negative_slope {} not a valid number".format(param)) - return math.sqrt(2.0 / (1 + negative_slope ** 2)) - raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) - - -def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]: - """ - Calculate fan_in / fan_out value for given weight tensor.
This function assumes - input tensor is stored in NCHW format. - - :param tensor: Weight tensor in NCHW format - """ - shape = tensor.shape - ndim = len(shape) - if ndim < 2: - raise ValueError( - "fan_in and fan_out can not be computed for tensor with fewer than 2 " - "dimensions" - ) - - if ndim == 2: # Linear - fan_in = shape[1] - fan_out = shape[0] - else: - num_input_fmaps = shape[1] - num_output_fmaps = shape[0] - receptive_field_size = 1 - if ndim > 2: - receptive_field_size = reduce(lambda x, y: x * y, shape[2:], 1) - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - return fan_in, fan_out - - -def calculate_correct_fan(tensor: Tensor, mode: str) -> float: - """ - Calculate fan_in or fan_out value for given weight tensor, depending on given - ``mode``. - - See :func:`calculate_fan_in_and_fan_out` for details. - - :param tensor: Weight tensor in NCHW format - :param mode: ``'fan_in'`` or ``'fan_out'`` - """ - mode = mode.lower() - valid_modes = ["fan_in", "fan_out"] - if mode not in valid_modes: - raise ValueError( - "Mode {} not supported, please use one of {}".format(mode, valid_modes) - ) - - fan_in, fan_out = calculate_fan_in_and_fan_out(tensor) - return fan_in if mode == "fan_in" else fan_out - - -def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None: - r"""Fill ``tensor`` with random values sampled from :math:`\mathcal{U}(-a, a)` - where - - .. math:: - a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}} - - Also known as Glorot initialization. Detailed information can be retrieved from - `"Understanding the difficulty of training deep feedforward neural networks" `_. - - - :param tensor: An n-dimentional tensor to be initialized - :param gain: Scaling factor for :math:`a`. - """ - fan_in, fan_out = calculate_fan_in_and_fan_out(tensor) - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - a = math.sqrt(3.0) * std - uniform_(tensor, -a, a) - - -def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None: - r"""Fill ``tensor`` with random values sampled from - :math:`\mathcal{N}(0, \text{std}^2)` where - - .. math:: - \text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}} - - Also known as Glorot initialization. Detailed information can be retrieved from - `"Understanding the difficulty of training deep feedforward neural networks" `_. - - :param tensor: An n-dimentional tensor to be initialized - :param gain: Scaling factor for :math:`std`. - """ - fan_in, fan_out = calculate_fan_in_and_fan_out(tensor) - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - normal_(tensor, 0.0, std) - - -def msra_uniform_( - tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu" -) -> None: - r"""Fill ``tensor`` wilth random values sampled from - :math:`\mathcal{U}(-\text{bound}, \text{bound})` where - - .. math:: - \text{bound} = \sqrt{\frac{6}{(1 + a^2) \times \text{fan_in}}} - - Detailed information can be retrieved from - `"Delving deep into rectifiers: Surpassing human-level performance on ImageNet - classification" `_. - - - :param tensor: An n-dimentional tensor to be initialized - :param a: Optional parameter for calculating gain for leaky_relu. See - :func:`calculate_gain` for details. - :param mode: ``'fan_in'`` or ``'fan_out'``, used to calculate :math:`gain`, the - scaling factor for :math:`bound`. See :func:`calculate_fan_in_and_fan_out` for - details. - :param nonlinearity: Name of the non-linear function used to calculate :math:`gain`. 
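To make the fan/gain plumbing concrete, here is a worked example of what ``xavier_uniform_`` computes for a conv weight, done with plain arithmetic (the shapes and the choice of ReLU gain are picked for illustration):

import math

# Conv weight in NCHW layout: (out=64, in=32, kh=3, kw=3).
fan_in = 32 * 3 * 3   # num_input_fmaps * receptive_field_size = 288
fan_out = 64 * 3 * 3  # num_output_fmaps * receptive_field_size = 576
gain = math.sqrt(2.0)  # calculate_gain("relu")
std = gain * math.sqrt(2.0 / (fan_in + fan_out))
a = math.sqrt(3.0) * std  # xavier_uniform_ samples from U(-a, a)
print(round(a, 4))        # ~0.1179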
-
-
-def msra_uniform_(
-    tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
-) -> None:
-    r"""Fill ``tensor`` with random values sampled from
-    :math:`\mathcal{U}(-\text{bound}, \text{bound})` where
-
-    .. math::
-        \text{bound} = \sqrt{\frac{6}{(1 + a^2) \times \text{fan\_in}}}
-
-    Detailed information can be retrieved from
-    `"Delving deep into rectifiers: Surpassing human-level performance on ImageNet
-    classification" `_.
-
-    :param tensor: An n-dimensional tensor to be initialized
-    :param a: Optional parameter for calculating gain for leaky_relu. See
-        :func:`calculate_gain` for details.
-    :param mode: ``'fan_in'`` or ``'fan_out'``, used to calculate :math:`gain`, the
-        scaling factor for :math:`bound`. See :func:`calculate_fan_in_and_fan_out` for
-        details.
-    :param nonlinearity: Name of the non-linear function used to calculate :math:`gain`.
-        See :func:`calculate_gain` for details.
-    """
-    fan = calculate_correct_fan(tensor, mode)
-    gain = calculate_gain(nonlinearity, a)
-    std = gain / math.sqrt(fan)
-    bound = math.sqrt(3.0) * std
-    uniform_(tensor, -bound, bound)
-
-
-def msra_normal_(
-    tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
-) -> None:
-    r"""Fill ``tensor`` with random values sampled from
-    :math:`\mathcal{N}(0, \text{std}^2)` where
-
-    .. math::
-        \text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan\_in}}}
-
-    Detailed information can be retrieved from
-    `"Delving deep into rectifiers: Surpassing human-level performance on ImageNet
-    classification" `_.
-
-    :param tensor: An n-dimensional tensor to be initialized
-    :param a: Optional parameter for calculating gain for leaky_relu. See
-        :func:`calculate_gain` for details.
-    :param mode: ``'fan_in'`` or ``'fan_out'``, used to calculate :math:`gain`, the
-        scaling factor for :math:`std`. See :func:`calculate_fan_in_and_fan_out` for
-        details.
-    :param nonlinearity: Name of the non-linear function used to calculate :math:`gain`.
-        See :func:`calculate_gain` for details.
-    """
-    fan = calculate_correct_fan(tensor, mode)
-    gain = calculate_gain(nonlinearity, a)
-    std = gain / math.sqrt(fan)
-    normal_(tensor, 0, std)
diff --git a/python_module/megengine/module/linear.py b/python_module/megengine/module/linear.py
deleted file mode 100644
index 30f8ea82..00000000
--- a/python_module/megengine/module/linear.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import numpy as np
-
-from .. import functional as F
-from ..core import Parameter
-from . import init
-from .module import Module
-
-
-class Linear(Module):
-    r"""Applies a linear transformation to the input. For instance, if the input
-    is x, then the output y is:
-
-    .. math::
-
-            y = xW^T + b
-
-    where :math:`y_i = \sum_j W_{ij} x_j + b_i`
-
-    :param in_features: size of each input sample.
-    :param out_features: size of each output sample.
-    :param bias: If set to ``False``, the layer will not learn an additive bias.
- Default: ``True`` - - """ - - def __init__( - self, in_features: int, out_features: int, bias: bool = True, **kwargs - ): - super().__init__(**kwargs) - self.out_features = out_features - self.in_features = in_features - w_shape = (out_features, in_features) - self.weight = Parameter(np.zeros(w_shape, dtype=np.float32)) - self.bias = None - if bias: - b_shape = (out_features,) - self.bias = Parameter(np.zeros(b_shape, dtype=np.float32)) - self.reset_parameters() - - def _get_fanin(self): - return self.in_features - - def reset_parameters(self) -> None: - fanin = self._get_fanin() - std = np.sqrt(1 / fanin) - init.normal_(self.weight, 0.0, std) - if self.bias is not None: - init.zeros_(self.bias) - - def _calc_linear(self, x, weight, bias): - return F.linear(x, weight, bias) - - def forward(self, x): - return self._calc_linear(x, self.weight, self.bias) diff --git a/python_module/megengine/module/module.py b/python_module/megengine/module/module.py deleted file mode 100644 index b999fb12..00000000 --- a/python_module/megengine/module/module.py +++ /dev/null @@ -1,507 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from abc import ABCMeta, abstractmethod -from collections import OrderedDict -from typing import Any, Callable, Iterable, Optional, Set, Tuple, Union - -import numpy as np - -from .._internal.dtype import is_quantize -from ..core import Buffer, Parameter, Tensor -from ..logger import get_logger -from ..utils.hook import HookHandler - -logger = get_logger(__name__) - - -def _expand_structure(key, obj): - if isinstance(obj, (Tensor, Module)): - return [(key, obj)] - elif isinstance(obj, (list, tuple, dict)): - ret = [] - if isinstance(obj, dict): - targets = ((k, obj[k]) for k in sorted(obj)) - else: - targets = ((str(k), v) for k, v in enumerate(obj)) - for k, o in targets: - sub_ret = _expand_structure(k, o) - if sub_ret and not isinstance(k, str): - raise AssertionError( - "keys for Tensor and Module must be str, error key: {}".format(k) - ) - for kt, vt in sub_ret: - ret.extend([(key + "." + kt, vt)]) - return ret - else: - return [] - - -def _is_parameter(obj): - return isinstance(obj, Parameter) - - -def _is_buffer(obj): - return isinstance(obj, Buffer) - - -def _is_module(obj): - return isinstance(obj, Module) - - -class Module(metaclass=ABCMeta): - """Base Module class. - """ - - def __init__(self): - # runtime attributes - self.training = True - self.quantize_disabled = False - - # hooks - self._forward_pre_hooks = OrderedDict() - self._forward_hooks = OrderedDict() - - @abstractmethod - def forward(self, inputs): - pass - - def register_forward_pre_hook(self, hook: Callable) -> HookHandler: - """Register a hook to handle forward inputs. `hook` should be a function - - Note that `inputs` keyword inputs - - :param hook: a function that receive `module` and `inputs`, then return - a modified `inputs` or `None`. - :return: a handler with :meth:`~.HookHandler.remove` interface to delete the hook. - """ - return HookHandler(self._forward_pre_hooks, hook) - - def register_forward_hook(self, hook: Callable) -> HookHandler: - """Register a hook to handle forward results. 
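A minimal sketch of the hook API, assuming a constructed module `net` and an input tensor `x` (both illustrative, legacy megengine API assumed):

    def double_inputs(module, inputs):
        # pre-hook: may return a replacement for the `inputs` tuple; None keeps it
        return tuple(i * 2 for i in inputs)

    def log_output(module, inputs, outputs):
        # forward hook: may return a replacement for `outputs`; None keeps it
        print("forward produced:", outputs.shape)

    h1 = net.register_forward_pre_hook(double_inputs)
    h2 = net.register_forward_hook(log_output)
    y = net(x)     # both hooks fire around net.forward
    h1.remove()    # HookHandler.remove() deregisters a hook
    h2.remove()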
`hook` should be a function that - receive `module`, `inputs` and `outputs`, then return a modified `outputs` or `None`. - - This method return a handler with :meth:`~.HookHandler.remove` interface to delete the hook. - """ - return HookHandler(self._forward_hooks, hook) - - def __call__(self, *inputs, **kwargs): - for hook in self._forward_pre_hooks.values(): - modified_inputs = hook(self, inputs) - if modified_inputs is not None: - if not isinstance(modified_inputs, tuple): - modified_inputs = (modified_inputs,) - inputs = modified_inputs - - outputs = self.forward(*inputs, **kwargs) - - for hook in self._forward_hooks.values(): - modified_outputs = hook(self, inputs, outputs) - if modified_outputs is not None: - outputs = modified_outputs - return outputs - - def _flatten( - self, - *, - recursive: bool = True, - with_key: bool = False, - with_parent: bool = False, - prefix: Optional[str] = None, - predicate: Callable[[Any], bool] = lambda _: True, - seen: Optional[Set[int]] = None - ) -> Union[Iterable[Any], Iterable[Tuple[str, Any]]]: - """Scans the module object and returns an iterable for the :class:`~.Tensor` - and :class:`~.Module` attributes that agree with the ``predicate``. For multiple - calls of this function with same arguments, the order of objects within the - returned iterable is guaranteed to be identical, as long as all the involved - module objects' ``__dict__`` does not change thoughout those calls. - - :param recursive: Whether to recursively scan all the submodules. - :param with_key: Whether to yield keys along with yielded objects. - :param with_parent: Whether to yield ``self`` along with yielded objects. - :param prefix: The prefix appended to the yielded keys. - :param predicate: The predicate function applied to scanned objects. - :param seen: A dict that records whether a module has been traversed yet. - """ - if seen is None: - seen = set([id(self)]) - - module_dict = vars(self) - _prefix = "" if prefix is None else prefix + "." - - for key in sorted(module_dict): - for expanded_key, leaf in _expand_structure(key, module_dict[key]): - leaf_id = id(leaf) - if leaf_id in seen: - continue - seen.add(leaf_id) - - if predicate(leaf): - if with_key and with_parent: - yield _prefix + expanded_key, leaf, self - elif with_key: - yield _prefix + expanded_key, leaf - elif with_parent: - yield leaf, self - else: - yield leaf - - if recursive and isinstance(leaf, Module): - yield from leaf._flatten( - recursive=recursive, - with_key=with_key, - with_parent=with_parent, - prefix=_prefix + expanded_key if with_key else None, - predicate=predicate, - seen=seen, - ) - - def parameters( - self, requires_grad: Optional[bool] = None, recursive: bool = True, **kwargs - ) -> Iterable[Parameter]: - r"""Returns an iterable for the :class:`~.Parameter` of the module. - - :param requires_grad: Limitation over the :attr:`~.Parameter.requires_grad` - attribute of returned :class:`.Parameter`. ``None`` for no limitation. - :param recursive: If ``True``, returns all :class:`~.Parameter` within this - module, else only returns :class:`~.Parameter` that are direct attributes - of this module. 
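In use, the traversal helpers look like the following sketch; `net` stands for any Module instance and the key names are illustrative:

    # dotted keys follow _expand_structure: attribute names, plus stringified
    # indices for parameters held in lists/tuples/dicts, e.g. "blocks.0.weight"
    for name, p in net.named_parameters():
        print(name, p.shape)

    # only parameters with requires_grad=True:
    trainable = list(net.parameters(requires_grad=True))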
- """ - - def predicate(obj) -> bool: - return _is_parameter(obj) and ( - requires_grad is None or obj.requires_grad == requires_grad - ) - - yield from self._flatten( - with_key=False, predicate=predicate, recursive=recursive, **kwargs - ) - - def named_parameters( - self, - requires_grad: Optional[bool] = None, - prefix: Optional[str] = None, - recursive: bool = True, - **kwargs - ) -> Iterable[Tuple[str, Parameter]]: - """Returns an iterable for key :class:`~.Parameter` pairs of the module, where - ``key`` is the dotted path from this module to the :class:`~.Parameter` . - - :param requires_grad: Limitation over the :attr:`~.Parameter.requires_grad` - attribute of returned :class:`~.Parameter` . ``None`` for no limitation. - :param prefix: The prefix prepended to the keys. - :param recursive: If ``True``, returns all :class:`~.Parameter` within this - module, else only returns :class:`~.Parameter` that are direct attributes - of this module. - """ - - def predicate(obj) -> bool: - return _is_parameter(obj) and ( - requires_grad is None or obj.requires_grad == requires_grad - ) - - yield from self._flatten( - with_key=True, - prefix=prefix, - predicate=predicate, - recursive=recursive, - **kwargs, - ) - - def buffers(self, recursive: bool = True, **kwargs) -> Iterable[Buffer]: - """Returns an iterable for the :class:`~.Buffer` of the module. - - :param recursive: If ``True``, returns all :class:`~.Buffer` within this - module, else only returns :class:`~.Buffer` that are direct attributes - of this module. - """ - yield from self._flatten( - with_key=False, predicate=_is_buffer, recursive=recursive, **kwargs - ) - - def named_buffers( - self, prefix: Optional[str] = None, recursive: bool = True, **kwargs - ) -> Iterable[Tuple[str, Buffer]]: - """Returns an iterable for key :class:`~.Buffer` pairs of the module, where - ``key`` is the dotted path from this module to the :class:`~.Buffer` . - - :param prefix: The prefix prepended to the keys. - :param recursive: If ``True``, returns all :class:`~.Buffer` within this - module, else only returns :class:`~.Buffer` that are direct attributes - of this module. - """ - yield from self._flatten( - with_key=True, - prefix=prefix, - predicate=_is_buffer, - recursive=recursive, - **kwargs, - ) - - def children(self, **kwargs) -> "Iterable[Module]": - """Returns an iterable for all the submodules that are direct attributes of this - module. - """ - yield from self._flatten( - with_key=False, predicate=_is_module, recursive=False, **kwargs - ) - - def named_children(self, **kwargs) -> "Iterable[Tuple[str, Module]]": - """Returns an iterable of key-submodule pairs for all the submodules that are - direct attributes of this module, where 'key' is the attribute name of - submodules. - """ - yield from self._flatten( - with_key=True, predicate=_is_module, recursive=False, **kwargs - ) - - def modules(self, **kwargs) -> "Iterable[Module]": - """Returns an iterable for all the modules within this module, including itself. - """ - if "with_parent" in kwargs and kwargs["with_parent"]: - yield self, None - else: - yield self - yield from self._flatten(with_key=False, predicate=_is_module, **kwargs) - - def named_modules( - self, prefix: Optional[str] = None, **kwargs - ) -> "Iterable[Tuple[str, Module]]": - """Returns an iterable of key-module pairs for all the modules within this - module, including itself, where 'key' is the dotted path from this module to the - submodules. - - :param prefix: The prefix prepended to the path. 
- """ - if "with_parent" in kwargs and kwargs["with_parent"]: - yield ("" if prefix is None else prefix), self, None - else: - yield ("" if prefix is None else prefix), self - yield from self._flatten( - with_key=True, prefix=prefix, predicate=_is_module, **kwargs - ) - - def apply(self, fn: "Callable[[Module], Any]") -> None: - """Apply function ``fn`` to all the modules within this module, including - itself. - - :param fn: The function to be applied on modules. - """ - for it in self.modules(): - fn(it) - - def zero_grad(self) -> None: - """Set all parameters' grads to zero - """ - for param in self.parameters(): - if param.grad is not None: - param.grad.reset_zero() - - def train(self, mode: bool = True, recursive: bool = True) -> None: - """Set training mode of all the modules within this module (including itself) to - ``mode``. This effectively sets the ``training`` attributes of those modules - to ``mode``, but only has effect on certain modules (e.g. - :class:`~.BatchNorm2d`, :class:`~.Dropout`, :class:`~.Observer`) - - :param mode: the training mode to be set on modules. - :param recursive: whether to recursively call submodules' ``train()``. - """ - if not recursive: - self.training = mode - return - - def fn(module: Module) -> None: - module.train(mode, recursive=False) - - self.apply(fn) - - def eval(self) -> None: - """Set training mode of all the modules within this module (including itself) to - ``False``. See :meth:`~.Module.train` for details. - """ - self.train(False) - - def disable_quantize(self, value=True): - r""" - Set ``module``'s ``quantize_disabled`` attribute and return ``module``. - Could be used as a decorator. - """ - - def fn(module: Module) -> None: - module.quantize_disabled = value - - self.apply(fn) - - def replace_param( - self, params: dict, start_pos: int, seen: Optional[Set[int]] = None - ): - """Replace module's parameters with `params`, used by :class:`~.ParamPack` to - speedup multimachine training. - """ - offset = 0 - if seen is None: - seen = set([id(self)]) - module_dict = vars(self) - for key in sorted(module_dict): - hash_id = id(module_dict[key]) - if hash_id in seen: - continue - seen.add(hash_id) - if isinstance(module_dict[key], Parameter): - if start_pos + offset in params: - assert module_dict[key].shape == params[start_pos + offset].shape - module_dict[key] = params[start_pos + offset] - offset += 1 - if isinstance(module_dict[key], Module): - offset += module_dict[key].replace_param( - params, start_pos + offset, seen - ) - return offset - - def state_dict(self, rst=None, prefix="", keep_var=False): - r"""Returns a dictionary containing whole states of the module. - """ - - def is_state(obj): - return _is_parameter(obj) or _is_buffer(obj) - - if rst is None: - rst = OrderedDict() - - for k, v in self._flatten(recursive=False, with_key=True, predicate=is_state): - assert prefix + k not in rst, "duplicated state: {}".format(k) - if keep_var: - rst[prefix + k] = v - else: - rst[prefix + k] = v.numpy() - - for k, submodule in self._flatten( - recursive=False, - with_key=True, - predicate=lambda obj: isinstance(obj, Module), - ): - submodule.state_dict(rst, prefix + k + ".", keep_var) - - return rst - - def load_state_dict( - self, - state_dict: Union[dict, Callable[[str, Tensor], Optional[np.ndarray]]], - strict=True, - ): - r"""Load a given dictionary created by :func:`state_dict` into this module. - If ``strict`` is ``True``, the keys of :func:`state_dict` must exactly match the keys - returned by :func:`state_dict`. 
-
-        Users can also pass a closure: `Function[key: str, var: Tensor] -> Optional[np.ndarray]`
-        as a `state_dict`, in order to handle complex situations. For example, load everything
-        except for the final linear classifier:
-
-        .. code-block::
-
-            state_dict = {...}  # Dict[str, np.ndarray]
-            model.load_state_dict({
-                k: None if k.startswith('fc') else v
-                for k, v in state_dict.items()
-            }, strict=False)
-
-        Here returning `None` means skipping parameter `k`.
-
-        To prevent shape mismatch (e.g. when loading PyTorch weights), we can reshape
-        before loading:
-
-        .. code-block::
-
-            state_dict = {...}
-            def reshape_accordingly(k, v):
-                return state_dict[k].reshape(v.shape)
-            model.load_state_dict(reshape_accordingly)
-
-        We can also perform in-place re-initialization or pruning:
-
-        .. code-block::
-
-            def reinit_and_pruning(k, v):
-                if 'bias' in k:
-                    M.init.zeros_(v)
-                if 'conv' in k:
-                    return v.numpy() * (np.abs(v.numpy()) > 1e-3).astype("float32")
-            model.load_state_dict(reinit_and_pruning, strict=False)
-        """
-        unused = []
-        if isinstance(state_dict, dict):
-            unused = state_dict.keys()
-
-            def closure(k, _):  # var unused
-                return state_dict[k] if k in state_dict else None
-
-        elif callable(state_dict):
-            closure = state_dict
-        else:
-            raise ValueError(
-                "`state_dict` must be a dict or a callable, got {}".format(
-                    type(state_dict)
-                )
-            )
-
-        loaded, skipped = self._load_state_dict_with_closure(closure)
-        unused = set(unused) - loaded
-
-        if len(unused) != 0:
-            if strict:
-                raise KeyError(
-                    "Unused params violate `strict=True`, unused={}".format(unused)
-                )
-            else:
-                logger.warning(
-                    "Unused params in `strict=False` mode, unused={}".format(unused)
-                )
-
-        if len(skipped) != 0:
-            if strict:
-                raise KeyError(
-                    "Missing params violate `strict=True`, missing={}".format(skipped)
-                )
-            else:
-                logger.warning(
-                    "Missing params in `strict=False` mode, missing={}".format(skipped)
-                )
-
-    def _load_state_dict_with_closure(self, closure):
-        """Advanced state_dict loading through the callable `closure`, whose signature is
-
-        `closure(key: str, var: Tensor) -> Union[np.ndarray, None]`
-        """
-        assert callable(closure), "closure must be a function"
-
-        loaded = []
-        skipped = []
-
-        local_state_dict = self.state_dict(keep_var=True)
-        for k, var in local_state_dict.items():
-            to_be_load = closure(k, var)
-            if to_be_load is None:
-                skipped.append(k)
-                continue
-            assert isinstance(
-                to_be_load, np.ndarray
-            ), "closure should return a `np.ndarray`, got {} for `{}`".format(
-                to_be_load, k
-            )
-            assert (
-                var.shape == to_be_load.shape
-            ), "param `{}` shape mismatch, should be {}, got {}".format(
-                k, var.shape, to_be_load.shape
-            )
-            # For a quantized dtype, the initialized scale/zero_point may be
-            # invalid, so use the pretrained dtype instead.
-            if is_quantize(to_be_load.dtype) and is_quantize(var.dtype):
-                var.dtype = to_be_load.dtype
-            var.set_value(to_be_load)
-            loaded.append(k)
-
-        return set(loaded), set(skipped)
diff --git a/python_module/megengine/module/parampack.py b/python_module/megengine/module/parampack.py
deleted file mode 100644
index c020a41d..00000000
--- a/python_module/megengine/module/parampack.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections -from typing import Callable, Iterable, Optional, Tuple - -import numpy as np - -from .._internal.opr import param_pack_split -from ..core import Parameter, Tensor -from .module import Module - - -class ParamPack(Module): - r"""Pack module's parameters by gathering their memory to continuous address. - Using (device, dtype, requires_grad) as key, for example ('gpu0', float32, True), - parameters with same key will be packed togather. - It helps a lot for multimachine training by speeding up allreduce gradients. - - :param model: the module you want to pack parameters. - :param nr_ignore_first: how many parameters will be unpacked at first. - :param max_size_per_group: upper bound of packed parameters' size in MB. - :param max_nr_params_per_group: upper bound of the number of parameters of each group. - - """ - - def __init__( - self, - model: Module, - nr_ignore_first: int = 8, - max_size_per_group: int = 10, - max_nr_params_per_group: int = 100, - group_func: Callable = lambda name, param: 0, - ): - super().__init__() - self._model = model - self._nr_ignore_first = nr_ignore_first - self._max_size_per_group = max_size_per_group - self._max_nr_params_per_group = max_nr_params_per_group - self._group_func = group_func - self._grouped_params = [] - self._packed_params = [] - - params = model.named_parameters() - self._pack_params(params) - - def parameters(self, requires_grad: Optional[bool] = None) -> Iterable[Parameter]: - for param in self._packed_params: - if requires_grad is None or param.requires_grad == requires_grad: - yield param - - def named_parameters( - self, requires_grad: Optional[bool] = None - ) -> Iterable[Tuple[str, Parameter]]: - for idx, param in enumerate(self._packed_params): - if requires_grad is None or param.requires_grad == requires_grad: - yield "packed_param_" + str(idx), param - - def _pack_params(self, params: Iterable[Tuple[str, Parameter]]): - groups = collections.defaultdict(list) - ignored = 0 - param_id = 0 - for name, param in params: - if self._nr_ignore_first > ignored: - ignored += 1 - self._grouped_params.append([{"shape": param.shape, "id": param_id}]) - param.pack_group_key = self._group_func(name, param) - self._packed_params.append(param) - else: - key = ( - param.dtype, - param.device, - param.requires_grad, - self._group_func(name, param), - ) - groups[key].append({"tensor": param, "id": param_id}) - param_id += 1 - for (dtype, device, requires_grad, group_key) in groups.keys(): - dtype_sz = np.dtype(dtype).itemsize - align = device.mem_align - if align < dtype_sz: - align = 1 - else: - assert align % dtype_sz == 0 - align //= dtype_sz - - group = groups[(dtype, device, requires_grad, group_key)] - while group: - aligned_pos = [] - offset = 0 - params = [] - idx = 0 - while idx < len(group): - param = group[idx] - assert param["tensor"].device == device - padding = (align - (offset & (align - 1))) & (align - 1) - offset += padding - aligned_pos.append(offset) - params.append(param) - offset += int(np.prod(param["tensor"].shape)) - idx += 1 - - if ( - offset * dtype_sz >= self._max_size_per_group * 1024 * 1024 - or idx >= self._max_nr_params_per_group - ): - break - group = group[idx:] - if idx == 1: - # ignore param packs with only one item - params[0]["tensor"].pack_group_key = group_key - 
self._packed_params.append(params[0]["tensor"]) - self._grouped_params.append( - [{"shape": params[0]["tensor"].shape, "id": params[0]["id"]}] - ) - continue - - packed_value = np.zeros((offset,), dtype=dtype) - for param, pos in zip(params, aligned_pos): - val = param["tensor"].numpy() - packed_value[pos : pos + val.size] = val.flatten() - new_param = Parameter( - value=packed_value, - device=device, - dtype=dtype, - requires_grad=requires_grad, - ) - new_param.pack_group_key = group_key - self._packed_params.append(new_param) - self._grouped_params.append( - [{"shape": i["tensor"].shape, "id": i["id"]} for i in params] - ) - - def forward(self, *args, **kwargs): - replace_param = dict() - for i in range(len(self._packed_params)): - packed_param = self._packed_params[i] - grouped_params = self._grouped_params[i] - if len(grouped_params) == 1: - continue - split = param_pack_split( - packed_param._symvar, [i["shape"] for i in grouped_params] - ) - split = [ - Parameter(Tensor(i, requires_grad=packed_param.requires_grad)) - for i in split - ] - for j in range(len(split)): - replace_param[grouped_params[j]["id"]] = split[j] - self._model.replace_param(replace_param, 0) - - return self._model.forward(*args, **kwargs) diff --git a/python_module/megengine/module/pooling.py b/python_module/megengine/module/pooling.py deleted file mode 100644 index 8126ddc1..00000000 --- a/python_module/megengine/module/pooling.py +++ /dev/null @@ -1,80 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from abc import abstractmethod -from typing import Tuple, Union - -from ..functional import avg_pool2d, max_pool2d -from .module import Module - - -class _PoolNd(Module): - def __init__( - self, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = None, - padding: Union[int, Tuple[int, int]] = 0, - ): - super(_PoolNd, self).__init__() - self.kernel_size = kernel_size - self.stride = stride or kernel_size - self.padding = padding - - @abstractmethod - def forward(self, inp): - pass - - -class MaxPool2d(_PoolNd): - r"""Applies a 2D max pooling over an input. - - For instance, given an input of the size :math:`(N, C, H, W)` and - :attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of - the size :math:`(N, C, H_{out}, W_{out})` through a process described as: - - .. math:: - \begin{aligned} - out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} - \text{input}(N_i, C_j, \text{stride[0]} \times h + m, - \text{stride[1]} \times w + n) - \end{aligned} - - If :attr:`padding` is non-zero, then the input is implicitly zero-padded on - both sides for :attr:`padding` number of points. - - :param kernel_size: the size of the window to take a max over. - :param stride: the stride of the window. Default value is ``kernel_size``. - :param padding: implicit zero padding to be added on both sides. - """ - - def forward(self, inp): - return max_pool2d(inp, self.kernel_size, self.stride, self.padding) - - -class AvgPool2d(_PoolNd): - r"""Applies a 2D average pooling over an input. 
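Both pooling windows above follow the usual floor convention for the output spatial size, H_out = floor((H + 2*padding - kH) / stride) + 1 (and likewise for width). A standalone check of that arithmetic, with illustrative sizes:

    def pool_out(size, kernel, stride, padding):
        # H_out = floor((H + 2*padding - kernel) / stride) + 1
        return (size + 2 * padding - kernel) // stride + 1

    print(pool_out(224, 2, 2, 0))   # 112: a 2x2/stride-2 pool halves a 224x224 map
    print(pool_out(7, 3, 2, 1))     # 4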
- - For instance, given an input of the size :math:`(N, C, H, W)` and - :attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of - the size :math:`(N, C, H_{out}, W_{out})` through a process described as: - - .. math:: - - out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} - input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) - - If :attr:`padding` is non-zero, then the input is implicitly zero-padded on - both sides for :attr:`padding` number of points. - - :param kernel_size: the size of the window. - :param stride: the stride of the window. Default value is ``kernel_size``. - :param padding: implicit zero padding to be added on both sides. - """ - - def forward(self, inp): - return avg_pool2d(inp, self.kernel_size, self.stride, self.padding) diff --git a/python_module/megengine/module/pytorch/__init__.py b/python_module/megengine/module/pytorch/__init__.py deleted file mode 100644 index 5902d9c3..00000000 --- a/python_module/megengine/module/pytorch/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .pytorch import PyTorchModule diff --git a/python_module/megengine/module/pytorch/pytorch.py b/python_module/megengine/module/pytorch/pytorch.py deleted file mode 100644 index 81548a50..00000000 --- a/python_module/megengine/module/pytorch/pytorch.py +++ /dev/null @@ -1,451 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
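A hypothetical end-to-end use of the wrapper this file implements; `SomeTorchNet`, the device string, and the input `x` are all assumptions, and the import path follows the legacy package layout:

    import torch
    from megengine.module.pytorch import PyTorchModule

    torch_net = SomeTorchNet()                 # any torch.nn.Module (illustrative)
    net = PyTorchModule(torch_net, device="gpu0", output_cnt=1)
    y = net(x)                                 # x: a megengine tensor; forward runs in torch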
-import collections -import copy -import functools -import os -from typing import Any, Callable, List, Optional, Tuple - -import torch -from torch.utils.cpp_extension import load as load_torch_extension - -import megengine._internal as mgb -from megengine._internal import CompGraph -from megengine._internal.mgb import CompGraphCallbackValueProxy - -from ...core import Parameter, Tensor, get_default_device -from ..module import Module -from .utils import device_to_torch_device, torch_dtype_to_numpy_dtype - -# A global dict to map opr during graph copy -_copy_dict = {} - - -@functools.lru_cache(None) -def _get_torch_mem_fwd_lib(): - source_file = os.path.join(os.path.dirname(__file__), "torch_mem_fwd.cpp") - return load_torch_extension( - "torch_mem_fwd", - [source_file], - extra_include_paths=[mgb.config.get_include_path()], - ) - - -def inp_mem_fwd(pubapi_dev_tensor_ptr: int) -> torch.Tensor: - """Forward a MegBrain tensor to torch tensor - - :param pubapi_dev_tensor_ptr: pointer to MegBrain tensor - """ - return _get_torch_mem_fwd_lib().inp_mem_fwd(pubapi_dev_tensor_ptr) - - -def oup_mem_fwd( - pubapi_dev_tensor_ptr: int, tensor: torch.Tensor, keep_data_ptr: bool = True -) -> None: - """Forward a torch tensor to a contiguous MegBrain tensor - - :param pubapi_dev_tensor_ptr: Pointer to the MegBrain tensor - :param tensor: The input torch tensor - :param keep_data_ptr: if True, memory copy is not allowed here, - thus the input torch tensor must be contiguous also. - defaults to True - """ - _get_torch_mem_fwd_lib().oup_mem_fwd(pubapi_dev_tensor_ptr, tensor, keep_data_ptr) - - -def torch_param_to_mge( - name: str, param: torch.nn.Parameter, device, comp_graph: CompGraph -) -> Parameter: - """Convert a torch parameter to a megengine parameter - - :param name: parametr name - :param param: torch parameter - :param device: the device on which the megengine parameter is, - should be physically the same as the one on torch parameter - :param comp_graph: the owner graph of megengine parameter - :return: megengine parameter - """ - assert isinstance(param, torch.nn.Parameter) - dtype = torch_dtype_to_numpy_dtype(param.dtype) - mge_param = Parameter(None, dtype=dtype) - shared_nd = mge_param._Tensor__val - oup_mem_fwd(shared_nd.pubapi_dev_tensor_ptr, param.data, True) - return mge_param - - -class _PyTorchSubgraphGradOpr(mgb.craniotome.CraniotomeBase): - __nr_inputs__ = None - __nr_outputs__ = None - __allow_duplicate__ = False - __disable_sys_mem_alloc__ = True - __is_dynamic_output_shape__ = True - _forward_opr = None # type: PyTorchSubgraphImplOpr - _shape_infer_func = None - _condensed_out_grad_idx = None # type: List[Optional[int]] - - _forward_input_cnt = None - _forward_output_cnt = None - _output_grad_cnt = None - _param_cnt = None - - def setup( - self, forward_opr, condensed_out_grad_idx: List[Optional[int]], infer_shape=None - ): - self._forward_opr = forward_opr - self._forward_input_cnt = forward_opr.input_cnt - self._forward_output_cnt = forward_opr.output_cnt - self._param_cnt = forward_opr.param_cnt - self._output_grad_cnt = sum([idx is not None for idx in condensed_out_grad_idx]) - self.__nr_inputs__ = ( - self._forward_input_cnt - + self._param_cnt - + self._forward_output_cnt - + self._output_grad_cnt - ) - self.__nr_outputs__ = self._forward_input_cnt + self._param_cnt - self._forward_opr = forward_opr - self._condensed_out_grad_idx = condensed_out_grad_idx - self._shape_infer_func = infer_shape - if infer_shape is not None: - type(self).__is_dynamic_output_shape__ = False - - 
def execute( - self, - inputs: Tuple[CompGraphCallbackValueProxy, ...], - outputs: Tuple[mgb.SharedND, ...], - ): - assert self._forward_opr._last_forward_inputs is not None - assert self._forward_opr._last_forward_outputs is not None - if self._forward_opr._last_forward_outputs is None: - self._forward_opr.execute(inputs[: self.__nr_outputs__], None) - - out_grads = [ - inp_mem_fwd(inputs[idx].pubapi_dev_tensor_ptr) if idx else None - for idx in self._condensed_out_grad_idx - ] - - grads = torch.autograd.grad( - self._forward_opr._last_forward_outputs, - self._forward_opr._last_forward_inputs - + self._forward_opr._last_forward_params, - out_grads, # type: ignore - only_inputs=True, - allow_unused=True, - ) - for ovar, oten in zip(outputs, grads): - oup_mem_fwd(ovar.pubapi_dev_tensor_ptr, oten) - - def grad(self, wrt_idx, inputs, outputs, out_grad): - raise NotImplementedError("Apply grad to a grad opr is not supported") - - def infer_shape(self, inp_shapes): - if callable(self._shape_infer_func): - return self._shape_infer_func(inp_shapes) - raise NotImplementedError( - "No shape inference function specified on PyTorchSubgraphImplOpr" - ) - - def copy(self): - - ret = type(self)() - d0 = self.__dict__.copy() - d0.pop("this") - d0.pop("_forward_opr") - - later_copy = self._forward_opr in _copy_dict - if later_copy: - assert len(_copy_dict) == 1 - forward_opr_copy = _copy_dict[self._forward_opr] - else: - forward_opr_copy = self._forward_opr - ret.__dict__["_forward_opr"] = forward_opr_copy - - ret.__dict__.update(copy.deepcopy(d0)) - _copy_dict[self] = ret - if later_copy: - forward_opr_copy._grad_opr = ret - _copy_dict.clear() - - return ret - - -class PyTorchSubgraphImplOpr(mgb.craniotome.CraniotomeBase): - # pylint: disable=abstract-method - """This is a pytorch module wrapper to operator""" - - __nr_inputs__ = None # type: int - __nr_outputs__ = None # type: int - __allow_duplicate__ = False - __disable_sys_mem_alloc__ = True - __is_dynamic_output_shape__ = True - - _grad_opr = None - _func = None # type: Callable[[Any], Any] - input_cnt = None # type: int - output_cnt = None # type: int - param_cnt = None # type: int - _shape_infer_func = None - - _last_forward_inputs = None - _last_forward_outputs = None # type: List[torch.Tensor] - _last_forward_params = None # type: List[torch.Tensor] - - def setup(self, *, input_cnt, output_cnt, func, params, infer_shape=None): - """Setup the operator by accepted kwargs - - :param input_cnt: input count of torch module - :param output_cnt: output count of torch module - :param func: a callable object accept inputs and returns outputs - usually a torch module itself - :param params: parameters of the torch module - :param infer_shape: a callable infers output shapes from input shapes, - defaults to None - """ - param_cnt = len(params) - self.input_cnt = input_cnt - self.output_cnt = output_cnt - self.param_cnt = param_cnt - self.__nr_inputs__ = input_cnt + param_cnt - self.__nr_outputs__ = output_cnt - self._func = func - self._shape_infer_func = infer_shape - if infer_shape is not None: - type(self).__is_dynamic_output_shape__ = False - self._last_forward_params = params - - def execute( - self, - inputs: Tuple[CompGraphCallbackValueProxy, ...], - outputs: Optional[Tuple[mgb.SharedND, ...]], - ): - """execute the operator, read values from *inputs*, - forward them to torch tensor and do execution by self.func - and forward results to outputs - - :param inputs: values for each input var - :param outputs: values for each output var - """ - 
input_value_proxys = inputs[: self.input_cnt] - - input_torch_tensors = [ - inp_mem_fwd(ivar.pubapi_dev_tensor_ptr).requires_grad_() - for ivar in input_value_proxys - ] - - output_torch_tensors = self._func(*input_torch_tensors) - - if isinstance(output_torch_tensors, torch.Tensor): - output_torch_tensors = [output_torch_tensors] - - # `execute` may be called in _PyTorchSubgraphGradOp with None as outputs - if outputs: - for ovar, oten in zip(outputs, output_torch_tensors): - oup_mem_fwd(ovar.pubapi_dev_tensor_ptr, oten) - - # Retain input / output tensors for backward - self._last_forward_inputs = input_torch_tensors - self._last_forward_outputs = output_torch_tensors - - def grad( - self, - wrt_idx, - inputs: Tuple[mgb.SymbolVar, ...], - outputs: Tuple[mgb.SymbolVar, ...], - out_grads: Tuple[mgb.SymbolVar, ...], - ): - """generate a grad opr which calculates grad by torch.autograd.grad and cache it - - :param wrt_idx: the input var with respect to which the gradient should - be computed - :param inputs: operator inputs - :param outputs: operator outputs - :param out_grads: gradients of each output var - :return: an initialized grad opr - """ - if self._grad_opr is None: - condensed_out_grad = [] - condensed_out_grad_idx = [] # type: List[Optional[int]] - idx = self.__nr_inputs__ + len(outputs) - for out_grad in out_grads: - if out_grad is None: - condensed_out_grad_idx.append(None) - else: - condensed_out_grad.append(out_grad) - condensed_out_grad_idx.append(idx) - idx += 1 - self._grad_opr = _PyTorchSubgraphGradOpr.make( - *(inputs + outputs + tuple(condensed_out_grad)), - forward_opr=self, - condensed_out_grad_idx=condensed_out_grad_idx, - ) - return self._grad_opr - - def infer_shape(self, inp_shapes): - """infer output shape from input shapes - - :param inp_shapes: input shapes as tuple - :return: output shapes - """ - if callable(self._shape_infer_func): - return self._shape_infer_func(inp_shapes) - raise NotImplementedError( - "No shape inference function specified on PyTorchSubgraphImplOpr" - ) - - def copy(self): - ret = type(self)() - d0 = self.__dict__.copy() - d0.pop("this") - - ret.__dict__["_last_forward_inputs"] = d0.pop("_last_forward_inputs") - ret.__dict__["_last_forward_outputs"] = d0.pop("_last_forward_outputs") - ret.__dict__["_last_forward_params"] = d0.pop("_last_forward_params") - ret.__dict__["_func"] = d0.pop("_func") - - d0.pop("_grad_opr") - later_copy = self._grad_opr in _copy_dict - if later_copy: - assert len(_copy_dict) == 1 - grad_opr_copy = _copy_dict[self._grad_opr] - else: - grad_opr_copy = self._grad_opr - ret.__dict__["_grad_opr"] = grad_opr_copy - - ret.__dict__.update(copy.deepcopy(d0)) - _copy_dict[self] = ret - if later_copy: - grad_opr_copy._forward_opr = ret - _copy_dict.clear() - - return ret - - -class PyTorchModule(Module): - """Wrap a pytorch module as megengine module - - :param torch_module: torch module to be wrapped - :param device: target device this module would be in - :param output_cnt: output count of this module - :param input_shape: input shape inferrer - :param comp_graph: target comp_graph on which this module would be in - """ - - __torch_module = None # type: torch.nn.Module - __output_cnt = None - __infer_shape = None - __comp_graph = None - __device = None - _torch_params = None - _param_inputs = None - _name_param_list = None # type: List[Tuple[str, Parameter]] - - def __init__( - self, - torch_module, - device=None, - output_cnt=1, - *, - infer_shape=None, - comp_graph=None - ): - super().__init__() - if not 
isinstance(torch_module, torch.nn.Module): - raise TypeError( - "torch_module should either be an instance of torch.nn.Module " - "or its subclass" - ) - self.__torch_module = torch_module - - if not isinstance(output_cnt, int): - raise TypeError("output_cnt must be int") - if output_cnt <= 0: - raise ValueError("output_cnt must be greater than zero") - self.__output_cnt = output_cnt - - if infer_shape and not callable(infer_shape): - raise TypeError("infer_shape should either be None or a callable object") - self.__infer_shape = infer_shape - - if comp_graph and not isinstance(comp_graph, mgb.CompGraph): - raise TypeError("comp_graph shoud eighter be None or a mgb.CompGraph") - self.__comp_graph = comp_graph - - self._torch_params = [] - self._param_inputs = [] - self._name_param_list = [] - - if device is None: - device = get_default_device() - - if isinstance(device, str): - device = mgb.comp_node(device) - self.device = device - - def init_params(self): - """forward torch parameters to megengine parameters and store, - would be called in constructor and setter of device - """ - self._torch_params = [] - self._param_inputs = [] - self._name_param_list = [] - - for name, torch_param in self.__torch_module.named_parameters(recurse=True): - formated_name = "_torch_{}_{}".format(id(self.__torch_module), name) - mge_param = torch_param_to_mge( - formated_name, torch_param, self.device, self.__comp_graph - ) - self._param_inputs.append(mge_param) - self._torch_params.append(torch_param) - self._name_param_list.append((name, mge_param)) - - def get_param_by_name(self, param_name: str) -> Parameter: - """find parameter by its name - - :param param_name: name of parameter - :return: the parameter - """ - for name, param in self._name_param_list: - if param_name == name: - return param - raise KeyError("Cannot find param: {}".format(param_name)) - - def forward(self, *inputs): - """apply the module on given inputs - - :return: output vars - """ - param_inputs = [param._symvar for param in self._param_inputs] - - inputs = [tensor._symvar for tensor in list(inputs)] + param_inputs - - out = PyTorchSubgraphImplOpr.make( - *inputs, - input_cnt=len(inputs) - len(param_inputs), - output_cnt=self.__output_cnt, - func=self.__torch_module.forward, - params=self._torch_params, - infer_shape=self.__infer_shape, - ) - if isinstance(out, mgb.SymbolVar): - return Tensor(out) - assert isinstance(out, collections.Iterable) - return [Tensor(sym) for sym in out] - - def get_device(self): - """get the device this module belongs to""" - return self.__device - - def set_device(self, device: mgb.CompNode): - """set the device and move torch module to corresponding device""" - touch_device = device_to_torch_device(device) - self.__torch_module.to(device=touch_device) - self.__device = device - self.init_params() - - device = property(get_device, set_device) diff --git a/python_module/megengine/module/pytorch/torch_mem_fwd.cpp b/python_module/megengine/module/pytorch/torch_mem_fwd.cpp deleted file mode 100644 index dfcbe8f3..00000000 --- a/python_module/megengine/module/pytorch/torch_mem_fwd.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/** - * \file python_module/megengine/module/pytorch/torch_mem_fwd.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
- * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#include "torch/extension.h" -#include "megbrain_pubapi.h" - -using MGBTensor = mgb::pubapi::DeviceTensor; - -torch::Tensor mgb_to_torch(const MGBTensor *src) { - - mgb::pubapi::CallbackOnce deleter; - void* tensor_raw_ptr; - src->forward_to(&tensor_raw_ptr, &deleter); - auto deleter_wrap = [deleter](void*) mutable { - deleter.consume(); - }; - - // TODO: support non-contiguous layout - std::vector sizes; - for (size_t i = 0; i < src->desc.ndim; ++ i) { - sizes.push_back(src->desc.shape[i]); - } - - torch::TensorOptions options; - switch (src->desc.dtype) { -#define map_dtype(mgb_dtype, torch_dtype) \ - case MGBTensor::DataType::mgb_dtype: \ - options = options.dtype(caffe2::TypeMeta::Make()); \ - break; - map_dtype(FLOAT32, float); - map_dtype(FLOAT16, torch::Half); - map_dtype(INT32, int); - map_dtype(INT16, int16_t); - map_dtype(INT8, int8_t); - map_dtype(UINT8, uint8_t); -#undef map_dtype - default: - throw std::runtime_error("bad case for data type."); - } - - // TODO: Maybe we should impl copy on different devices? - switch (src->desc.type) { - case MGBTensor::Type::CUDA: { - int device_id = src->desc.cuda_ctx.device; - if (device_id >= 0) { - options = options.device(torch::DeviceType::CUDA, device_id); - } else { - throw std::runtime_error("bad case for device(cuda) id."); - } - // TODO: consider cuda synchronization here - // Maybe all tasks issued on cuda_ctx(device, stream) should be done? - break; - } - case MGBTensor::Type::CPU: - options = options.device(torch::DeviceType::CPU); - // Torch's API are all synchronous. - src->sync(); - break; - default: - throw std::runtime_error("bad case for device type."); - } - - auto tensor = torch::from_blob(tensor_raw_ptr, sizes, deleter_wrap, options); - return tensor; -} - -void torch_to_mgb(MGBTensor* dst, torch::Tensor src) { - MGBTensor::Desc desc; - - desc.dev_ptr = src.data_ptr(); - - // src is contiguous torch tensor here, so no strides needed - std::vector shape; - // desc.shape is the pointer to a size array used to construct - // an inner-mgb tensor, which should be valid until calling of - // forward_other_memory return - for (auto &&i : src.sizes()) { - shape.push_back(i); - } - desc.shape = shape.data(); - desc.ndim = shape.size(); - - switch (src.scalar_type()) { -#define map_dtype(mgb_dtype, torch_dtype) \ - case torch::ScalarType::torch_dtype: \ - desc.dtype = MGBTensor::DataType::mgb_dtype; \ - break; - map_dtype(FLOAT32, Float); - map_dtype(FLOAT16, Half); - map_dtype(INT32, Int); - map_dtype(INT16, Short); - map_dtype(INT8, Char); - map_dtype(UINT8, Byte); -#undef map_dtype - default: - throw std::runtime_error("bad case for data type."); - } - - // TODO: cuda setting and synchronization like mgb_to_torch - if (src.device().type() == torch::DeviceType::CUDA) { - desc.type = MGBTensor::Type::CUDA; - desc.cuda_ctx.device = src.get_device(); - desc.cuda_ctx.stream = nullptr; - } else { - assert(src.device().type() == torch::DeviceType::CPU); - desc.type = MGBTensor::Type::CUDA; - } - - mgb::pubapi::CallbackOnce deleter; - deleter.user_data = new torch::Tensor(src); - deleter.fptr = [](void* ptr) { - delete static_cast(ptr); - }; - dst->forward_other_memory(desc, deleter); -} - -torch::Tensor inp_mem_fwd(uintptr_t dv_ptr) { - // construct torch Tensor from mgb DeviceTensor stored in dv_ptr. 
- return mgb_to_torch(reinterpret_cast(dv_ptr)); -} - -void oup_mem_fwd(uintptr_t dv_ptr, torch::Tensor src, - bool keep_data_ptr=false) { - // forward storage in torch Tensor to mgb DeviceTensor - // keep_data_ptr: set to True to ensure forwarding data_ptr under \p src - // to megbrain, or it maybe copy src to a new contiguous tensor storage. - - // which would return src itself if tensor is contiguous - auto src_contig = src.contiguous(); - - if (keep_data_ptr && src_contig.data_ptr() != src.data_ptr()) { - throw std::runtime_error("should keep tensor data ptr, but it changed"); - } - torch_to_mgb(reinterpret_cast(dv_ptr), src_contig); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("inp_mem_fwd", &inp_mem_fwd, "Forward mgb DeviceTensor ptr into torch Tensor as network input."); - m.def("oup_mem_fwd", &oup_mem_fwd, "Forward torch network Tensor to corresponding mgb VarNode.", - py::arg("dv_ptr"), py::arg("src"), py::arg("keep_data_ptr") = false); -} diff --git a/python_module/megengine/module/pytorch/utils.py b/python_module/megengine/module/pytorch/utils.py deleted file mode 100644 index fea87bfb..00000000 --- a/python_module/megengine/module/pytorch/utils.py +++ /dev/null @@ -1,67 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np -import torch - -import megengine._internal as mgb - -_TORCH_NUMPY_MAPPING = { - torch.float16: np.float16, - torch.float32: np.float32, - torch.float64: np.float64, - torch.int8: np.int8, - torch.int16: np.int16, - torch.int32: np.int32, -} - - -def torch_dtype_to_numpy_dtype(torch_dtype: torch.dtype): - """map torch dtype to numpy dtype - - :param torch_dtype: torch dtype - :return: numpy dtype - """ - if not isinstance(torch_dtype, torch.dtype): - raise TypeError("Argument `torch_dtype` should be an instance of torch.dtype") - if torch_dtype not in _TORCH_NUMPY_MAPPING: - raise ValueError("Unknown PyTorch dtype: {}".format(torch_dtype)) - return _TORCH_NUMPY_MAPPING[torch_dtype] - - -def torch_device_to_device(device: torch.device): - """map torch device to device - - :param device: torch device - :return: device - """ - if not isinstance(device, torch.device): - raise TypeError("Argument `device` should be an instance of torch.device") - index = device.index - if index is None: - index = "x" - if device.type == "cpu": - return "cpu{}".format(index) - elif device.type == "cuda": - return "gpu{}".format(index) - raise ValueError("Unknown PyTorch device: {}".format(device)) - - -def device_to_torch_device(device: mgb.CompNode): - """map device to torch device - - :param device: megbrain compute node - :return: corresponding torch device - """ - t, d, _ = device.locator_physical - if t == "CUDA": - return torch.device("cuda", d) - elif t == "CPU": - return torch.device("cpu", d) - else: - raise Exception("Unsupported device type: {}".format(t)) diff --git a/python_module/megengine/module/qat/__init__.py b/python_module/megengine/module/qat/__init__.py deleted file mode 100644 index b6adab4d..00000000 --- a/python_module/megengine/module/qat/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 
Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .concat import Concat -from .conv import Conv2d, ConvRelu2d -from .conv_bn import ConvBn2d, ConvBnRelu2d -from .elemwise import Elemwise -from .linear import Linear -from .module import QATModule -from .quant_dequant import DequantStub, QuantStub diff --git a/python_module/megengine/module/qat/concat.py b/python_module/megengine/module/qat/concat.py deleted file mode 100644 index 893b1ad0..00000000 --- a/python_module/megengine/module/qat/concat.py +++ /dev/null @@ -1,30 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Iterable - -from ...core.tensor import Tensor -from .. import concat as Float -from .module import QATModule - - -class Concat(Float.Concat, QATModule): - r""" - A :class:`~.QATModule` to do functional concat with QAT support. - Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. - """ - - def forward(self, inps: Iterable[Tensor], axis: int = 0): - return self.apply_quant_activation(super().forward(inps, axis)) - - @classmethod - def from_float_module(cls, float_module): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. - """ - return cls() diff --git a/python_module/megengine/module/qat/conv.py b/python_module/megengine/module/qat/conv.py deleted file mode 100644 index 315da839..00000000 --- a/python_module/megengine/module/qat/conv.py +++ /dev/null @@ -1,59 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from ... import functional as F -from ...quantization.utils import fake_quant_bias -from .. import conv as Float -from .module import QATModule - - -class Conv2d(Float.Conv2d, QATModule): - r""" - A :class:`~.QATModule` Conv2d with QAT support. - Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. - """ - - def calc_conv_qat(self, inp): - w_qat = self.apply_quant_weight(self.weight) - b_qat = fake_quant_bias(self.bias, inp, w_qat) - conv = self.calc_conv(inp, w_qat, b_qat) - return conv - - @classmethod - def from_float_module(cls, float_module: Float.Conv2d): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. 
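The conversion flow that these classmethods support, sketched under the legacy API; `my_qconfig` and the input `inp` are illustrative:

    from megengine.module import Conv2d as FloatConv2d
    from megengine.module.qat import Conv2d as QATConv2d

    float_conv = FloatConv2d(32, 64, 3)                 # a trained float module
    qat_conv = QATConv2d.from_float_module(float_conv)  # reuses weight and bias
    qat_conv.set_qconfig(my_qconfig)                    # attach observers / fake-quant
    out = qat_conv(inp)                                 # forward with fake-quantized weight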
- """ - qat_module = cls( - float_module.in_channels, - float_module.out_channels, - float_module.kernel_size, - float_module.stride, - float_module.padding, - float_module.dilation, - float_module.groups, - float_module.bias is not None, - float_module.conv_mode.name, - float_module.compute_mode.name, - ) - qat_module.weight = float_module.weight - qat_module.bias = float_module.bias - return qat_module - - def forward(self, inp): - return self.apply_quant_activation(self.calc_conv_qat(inp)) - - -class ConvRelu2d(Conv2d): - r""" - A :class:`~.QATModule` include Conv2d and Relu with QAT support. - Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. - """ - - def forward(self, inp): - return self.apply_quant_activation(F.relu(self.calc_conv_qat(inp))) diff --git a/python_module/megengine/module/qat/conv_bn.py b/python_module/megengine/module/qat/conv_bn.py deleted file mode 100644 index 9ed6ebab..00000000 --- a/python_module/megengine/module/qat/conv_bn.py +++ /dev/null @@ -1,196 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from ...core import ones, zeros -from ...functional import add_update, relu, sqrt, sum, zero_grad -from ...quantization.utils import fake_quant_bias -from .. import conv_bn as Float -from .module import QATModule - - -class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule): - def get_batch_mean_var(self, inp): - def _sum_channel(inp, axis=0, keepdims=True): - if isinstance(axis, int): - out = sum(inp, axis=axis, keepdims=keepdims) - elif isinstance(axis, tuple): - for idx, elem in enumerate(axis): - out = sum(inp if idx == 0 else out, axis=elem, keepdims=keepdims) - return out - - sum1 = _sum_channel(inp, (0, 2, 3)) - sum2 = _sum_channel(inp ** 2, (0, 2, 3)) - reduce_size = inp.shapeof().prod() / inp.shapeof(1) - batch_mean = sum1 / reduce_size - batch_var = (sum2 - sum1 ** 2 / reduce_size) / reduce_size - return batch_mean, batch_var - - def fold_weight_bias(self, bn_mean, bn_var): - # get fold bn conv param - # bn_istd = 1 / bn_std - # w_fold = gamma / bn_std * W - # b_fold = gamma * (b - bn_mean) / bn_std + beta - gamma = self.bn.weight - if gamma is None: - gamma = ones((self.bn.num_features), dtype="float32") - gamma = gamma.reshape(1, -1, 1, 1) - beta = self.bn.bias - if beta is None: - beta = zeros((self.bn.num_features), dtype="float32") - beta = beta.reshape(1, -1, 1, 1) - - if bn_mean is None: - bn_mean = zeros((1, self.bn.num_features, 1, 1), dtype="float32") - if bn_var is None: - bn_var = ones((1, self.bn.num_features, 1, 1), dtype="float32") - - conv_bias = self.conv.bias - if conv_bias is None: - conv_bias = zeros(self.conv._infer_bias_shape(), dtype="float32") - - bn_istd = 1.0 / sqrt(bn_var + self.bn.eps) - # bn_istd = 1 / bn_std - # w_fold = gamma / bn_std * W - scale_factor = gamma * bn_istd - if self.conv.groups == 1: - w_fold = self.conv.weight * scale_factor.reshape(-1, 1, 1, 1) - else: - w_fold = self.conv.weight * scale_factor.reshape( - self.conv.groups, -1, 1, 1, 1 - ) - - w_fold = self.apply_quant_weight(w_fold) - # b_fold = gamma * (b - bn_mean) / bn_std + beta - b_fold = beta + gamma * (conv_bias - bn_mean) * bn_istd - return w_fold, b_fold - - def update_running_mean_and_running_var( - self, 
bn_mean, bn_var, num_elements_per_channel - ): - # update running mean and running var. no grad, use unbiased bn var - bn_mean = zero_grad(bn_mean) - bn_var = ( - zero_grad(bn_var) - * num_elements_per_channel - / (num_elements_per_channel - 1) - ) - exponential_average_factor = 1 - self.bn.momentum - add_update( - self.bn.running_mean, - delta=bn_mean, - alpha=1 - exponential_average_factor, - beta=exponential_average_factor, - ) - add_update( - self.bn.running_var, - delta=bn_var, - alpha=1 - exponential_average_factor, - beta=exponential_average_factor, - ) - - def calc_conv_bn_qat(self, inp, approx=True): - if self.training and not approx: - conv = self.conv(inp) - bn_mean, bn_var = self.get_batch_mean_var(conv) - num_elements_per_channel = conv.shapeof().prod() / conv.shapeof(1) - self.update_running_mean_and_running_var( - bn_mean, bn_var, num_elements_per_channel - ) - else: - bn_mean, bn_var = self.bn.running_mean, self.bn.running_var - - # get gamma and beta in BatchNorm - gamma = self.bn.weight - if gamma is None: - gamma = ones((self.bn.num_features), dtype="float32") - gamma = gamma.reshape(1, -1, 1, 1) - beta = self.bn.bias - if beta is None: - beta = zeros((self.bn.num_features), dtype="float32") - beta = beta.reshape(1, -1, 1, 1) - # conv_bias - conv_bias = self.conv.bias - if conv_bias is None: - conv_bias = zeros(self.conv._infer_bias_shape(), dtype="float32") - - bn_istd = 1.0 / sqrt(bn_var + self.bn.eps) - # bn_istd = 1 / bn_std - # w_fold = gamma / bn_std * W - scale_factor = gamma * bn_istd - if self.conv.groups == 1: - w_fold = self.conv.weight * scale_factor.reshape(-1, 1, 1, 1) - else: - w_fold = self.conv.weight * scale_factor.reshape( - self.conv.groups, -1, 1, 1, 1 - ) - b_fold = None - if not (self.training and approx): - # b_fold = gamma * (conv_bias - bn_mean) / bn_std + beta - b_fold = beta + gamma * (conv_bias - bn_mean) * bn_istd - - w_qat = self.apply_quant_weight(w_fold) - b_qat = fake_quant_bias(b_fold, inp, w_qat) - conv = self.conv.calc_conv(inp, w_qat, b_qat) - if not (self.training and approx): - return conv - - # rescale conv to get original conv output - orig_conv = conv / scale_factor.reshape(1, -1, 1, 1) - if self.conv.bias is not None: - orig_conv = orig_conv + self.conv.bias - # calculate batch norm - bn_mean, bn_var = self.get_batch_mean_var(orig_conv) - bn_istd = 1.0 / sqrt(bn_var + self.bn.eps) - conv = gamma * bn_istd * (orig_conv - bn_mean) + beta - num_elements_per_channel = conv.shapeof().prod() / conv.shapeof(1) - self.update_running_mean_and_running_var( - bn_mean, bn_var, num_elements_per_channel - ) - return conv - - @classmethod - def from_float_module(cls, float_module: Float._ConvBnActivation2d): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. - """ - qat_module = cls( - float_module.conv.in_channels, - float_module.conv.out_channels, - float_module.conv.kernel_size, - float_module.conv.stride, - float_module.conv.padding, - float_module.conv.dilation, - float_module.conv.groups, - float_module.conv.bias is not None, - float_module.conv.conv_mode.name, - float_module.conv.compute_mode.name, - ) - qat_module.conv.weight = float_module.conv.weight - qat_module.conv.bias = float_module.conv.bias - qat_module.bn = float_module.bn - return qat_module - - -class ConvBn2d(_ConvBnActivation2d): - r""" - A fused :class:`~.QATModule` including Conv2d, BatchNorm2d with QAT support. - Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. 
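The folding identity used above (w_fold = gamma / bn_std * W and b_fold = gamma * (b - bn_mean) / bn_std + beta) can be verified numerically. A self-contained numpy check, with one output channel reduced to scalars for brevity:

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.standard_normal(1000)
    w, b = 0.7, 0.2                       # stand-in for one conv channel
    gamma, beta, eps = 1.3, -0.1, 1e-5

    y = w * x + b                         # "conv" output
    mean, var = y.mean(), y.var()
    bn = gamma * (y - mean) / np.sqrt(var + eps) + beta

    istd = 1.0 / np.sqrt(var + eps)       # bn_istd = 1 / bn_std
    w_fold = w * gamma * istd
    b_fold = beta + gamma * (b - mean) * istd
    assert np.allclose(bn, w_fold * x + b_fold)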
- """ - - def forward(self, inp): - return self.apply_quant_activation(self.calc_conv_bn_qat(inp)) - - -class ConvBnRelu2d(_ConvBnActivation2d): - r""" - A fused :class:`~.QATModule` including Conv2d, BatchNorm2d and relu with QAT support. - Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. - """ - - def forward(self, inp): - return self.apply_quant_activation(relu(self.calc_conv_bn_qat(inp))) diff --git a/python_module/megengine/module/qat/elemwise.py b/python_module/megengine/module/qat/elemwise.py deleted file mode 100644 index f99583bd..00000000 --- a/python_module/megengine/module/qat/elemwise.py +++ /dev/null @@ -1,31 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .. import elemwise as Float -from .module import QATModule - - -class Elemwise(Float.Elemwise, QATModule): - r""" - A :class:`~.QATModule` to do elemwise operator with QAT support. - Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. - - :param method: the elemwise method, see :class:`~.module.elemwise.Elemwise` for detail. - """ - - with_weight = False - - def forward(self, *inps): - return self.apply_quant_activation(super().forward(*inps)) - - @classmethod - def from_float_module(cls, float_module: Float.Elemwise): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. - """ - return cls(float_module.method.name) diff --git a/python_module/megengine/module/qat/linear.py b/python_module/megengine/module/qat/linear.py deleted file mode 100644 index 4067d51c..00000000 --- a/python_module/megengine/module/qat/linear.py +++ /dev/null @@ -1,39 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from ...quantization.utils import fake_quant_bias -from .. import linear as Float -from .module import QATModule - - -class Linear(Float.Linear, QATModule): - r""" - A :class:`~.QATModule` version of :class:`~.module.linear.Linear`. - Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. - - :param in_features: size of each input sample. - :param out_features: size of each output sample. - :param bias: If set to ``False``, the layer will not learn an additive bias. - Default: ``True`` - - """ - - def forward(self, x): - w_qat = self.apply_quant_weight(self.weight) - b_qat = fake_quant_bias(self.bias, x, w_qat) - return self.apply_quant_activation(self._calc_linear(x, w_qat, b_qat)) - - @classmethod - def from_float_module(cls, float_module: Float.Linear): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. 
- """ - qmod = cls(float_module.in_features, float_module.out_features) - qmod.weight = float_module.weight - qmod.bias = float_module.bias - return qmod diff --git a/python_module/megengine/module/qat/module.py b/python_module/megengine/module/qat/module.py deleted file mode 100644 index c7cb80cb..00000000 --- a/python_module/megengine/module/qat/module.py +++ /dev/null @@ -1,154 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from abc import abstractmethod - -from ...core import Tensor -from ...quantization import FakeQuantize, Observer, QConfig -from ..module import Module - - -class QATModule(Module): - r""" - Base class of quantized-float related Module, basically for QAT and Calibration. - - Use :meth:`~.QATModule.from_float_module` to generate a instance from float :class:`~.Module`. - Or use :func:`~.quantize.quantize_qat` to do it recursively and automatically. - - Can also be converted to :class:`~.QuantizedModule` for deployment using - :func:`~.quantize.quantize` further. - """ - - with_weight = True - with_act = True - - def __init__(self): - super().__init__() - - self.weight_observer = None # type: Observer - self.act_observer = None # type: Observer - - self.weight_fake_quant = None # type: FakeQuantize - self.act_fake_quant = None # type: FakeQuantize - - def set_qconfig(self, qconfig: QConfig): - r""" - Set quantization related configs with ``qconfig``, including - observer and fake_quant for weight and activation. - """ - - def safe_call(func): - return func() if func is not None else None - - if self.with_act: - self.act_observer = safe_call(qconfig.act_observer) - self.act_fake_quant = safe_call(qconfig.act_fake_quant) - if self.with_weight: - self.weight_observer = safe_call(qconfig.weight_observer) - self.weight_fake_quant = safe_call(qconfig.weight_fake_quant) - - def _enable_exec(self, with_module, func, enable): - if not with_module or not func: - return - if enable: - func.enable() - else: - func.disable() - - def set_fake_quant(self, enable): - self._enable_exec(self.with_act, self.act_fake_quant, enable) - self._enable_exec(self.with_weight, self.weight_fake_quant, enable) - - def set_observer(self, enable): - self._enable_exec(self.with_act, self.act_observer, enable) - self._enable_exec(self.with_weight, self.weight_observer, enable) - - def _apply_fakequant_with_observer( - self, target: Tensor, fake_quant: FakeQuantize, observer: Observer - ): - # do observer - if observer is None: - oup = target - q_dict = None - else: - oup = observer(target) - q_dict = observer.get_qparams() - # do fake quant - if fake_quant is not None: - oup = fake_quant(oup, q_dict) - # use qparams of fake_quant if have. - if hasattr(fake_quant, "get_qparams"): - q_dict = fake_quant.get_qparams() - # set to tensor qparams. - if q_dict is not None: - oup.q_dict.update(q_dict) - return oup - - def apply_quant_weight(self, target: Tensor): - r""" - Apply weight's observer and fake_quant from ``qconfig`` on ``target``. - """ - return self._apply_fakequant_with_observer( - target, self.weight_fake_quant, self.weight_observer - ) - - def apply_quant_activation(self, target: Tensor): - r""" - Apply weight's observer and fake_quant from ``qconfig`` on ``target``. 
- """ - return self._apply_fakequant_with_observer( - target, self.act_fake_quant, self.act_observer - ) - - def _get_method_result( - self, method: str, fake_quant: FakeQuantize, observer: Observer - ): - if hasattr(fake_quant, method): - return getattr(fake_quant, method)() - elif hasattr(observer, method): - return getattr(observer, method)() - return None - - def get_weight_dtype(self): - r""" - Get weight's quantization dtype as the method from ``qconfig``. - """ - return self._get_method_result( - "get_dtype", self.weight_fake_quant, self.weight_observer - ) - - def get_activation_dtype(self): - r""" - Get activation's quantization dtype as the method from ``qconfig``. - """ - return self._get_method_result( - "get_dtype", self.act_fake_quant, self.act_observer - ) - - def get_weight_qparams(self): - r""" - Get weight's quantization parameters. - """ - return self._get_method_result( - "get_qparams", self.weight_fake_quant, self.weight_observer - ) - - def get_activation_qparams(self): - r""" - Get activation's quantization parameters. - """ - return self._get_method_result( - "get_qparams", self.act_fake_quant, self.act_observer - ) - - @classmethod - @abstractmethod - def from_float_module(cls, float_module: Module): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. - """ diff --git a/python_module/megengine/module/qat/quant_dequant.py b/python_module/megengine/module/qat/quant_dequant.py deleted file mode 100644 index 0baa3e1c..00000000 --- a/python_module/megengine/module/qat/quant_dequant.py +++ /dev/null @@ -1,50 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .. import quant_dequant as Float -from .module import QATModule - - -class QuantStub(Float.QuantStub, QATModule): - r""" - A helper QATModule simply return input, but will quantize - input after converted to :class:`~.QuantizedModule`. - """ - - with_weight = False - - def forward(self, inp): - return self.apply_quant_activation(inp) - - @classmethod - def from_float_module(cls, float_module: Float.QuantStub): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. - """ - return cls() - - -class DequantStub(Float.DequantStub, QATModule): - r""" - A helper QATModule simply return input, but will de-quantize - input after converted to :class:`~.QuantizedModule`. - """ - - with_weight = False - with_act = False - - def forward(self, inp): - return inp - - @classmethod - def from_float_module(cls, float_module: Float.DequantStub): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. - """ - return cls() diff --git a/python_module/megengine/module/quant_dequant.py b/python_module/megengine/module/quant_dequant.py deleted file mode 100644 index aaf2b0cc..00000000 --- a/python_module/megengine/module/quant_dequant.py +++ /dev/null @@ -1,28 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .module import Module - - -class QuantStub(Module): - r""" - A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule` - version :class:`~.qat.QuantStub` using :func:`~.quantize.quantize_qat`. - """ - - def forward(self, inp): - return inp - - -class DequantStub(Module): - r""" - A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule` - version :class:`~.qat.DequantStub` using :func:`~.quantize.quantize_qat`. - """ - - def forward(self, inp): - return inp diff --git a/python_module/megengine/module/quantized/__init__.py b/python_module/megengine/module/quantized/__init__.py deleted file mode 100644 index e641476d..00000000 --- a/python_module/megengine/module/quantized/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .concat import Concat -from .conv import Conv2d, ConvRelu2d -from .conv_bn import ConvBn2d, ConvBnRelu2d -from .elemwise import Elemwise -from .linear import Linear -from .module import QuantizedModule -from .quant_dequant import DequantStub, QuantStub diff --git a/python_module/megengine/module/quantized/concat.py b/python_module/megengine/module/quantized/concat.py deleted file mode 100644 index f9ef05d9..00000000 --- a/python_module/megengine/module/quantized/concat.py +++ /dev/null @@ -1,35 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Iterable - -from ... import functional as F -from ...core.tensor import Tensor -from ..qat import concat as QAT -from .module import QuantizedModule - - -class Concat(QuantizedModule): - r""" - A :class:`~.QuantizedModule` to do quantized concat, inference only. - """ - - def __init__(self, dtype=None): - super().__init__() - self.output_dtype = dtype - - def forward(self, inps: Iterable[Tensor], axis: int = 0): - new_inps = (x.astype(self.output_dtype) for x in inps) - return F.concat(new_inps, axis) - - @classmethod - def from_qat_module(cls, qat_module: QAT.Concat): - r""" - return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. - """ - return cls(qat_module.get_activation_dtype()) diff --git a/python_module/megengine/module/quantized/conv.py b/python_module/megengine/module/quantized/conv.py deleted file mode 100644 index 3118451d..00000000 --- a/python_module/megengine/module/quantized/conv.py +++ /dev/null @@ -1,108 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-from typing import Tuple, Union
-
-import numpy as np
-
-import megengine._internal as mgb
-
-from ... import module as Float
-from ...core import Parameter
-from ...functional import conv_bias_activation
-from ..qat import conv as QAT
-from .module import QuantizedModule
-
-
-class Conv2d(Float.Conv2d, QuantizedModule):
-    r"""Quantized version of :class:`~.qat.conv.Conv2d`.
-
-    Applies a 2D convolution over a quantized input tensor, inference only.
-    The parameters are the same as :class:`~.Conv2d`.
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: Union[int, Tuple[int, int]],
-        stride: Union[int, Tuple[int, int]] = 1,
-        padding: Union[int, Tuple[int, int]] = 0,
-        dilation: Union[int, Tuple[int, int]] = 1,
-        groups: int = 1,
-        conv_mode: str = "CROSS_CORRELATION",
-        compute_mode: str = "DEFAULT",
-        dtype=None,
-    ):
-        super().__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            True,
-            conv_mode,
-            compute_mode,
-        )
-        self.output_dtype = dtype
-
-    def calc_conv_quantized(self, inp, nonlinear_mode="IDENTITY"):
-        inp_scale = mgb.dtype.get_scale(inp.dtype)
-        w_scale = mgb.dtype.get_scale(self.weight.dtype)
-        bias_scale = inp_scale * w_scale
-        return conv_bias_activation(
-            inp,
-            self.weight,
-            self.bias.astype(mgb.dtype.qint32(bias_scale)),
-            self.output_dtype,
-            self.stride,
-            self.padding,
-            self.dilation,
-            self.groups,
-            conv_mode=self.conv_mode,
-            compute_mode=self.compute_mode,
-            nonlinear_mode=nonlinear_mode,
-        )
-
-    @classmethod
-    def from_qat_module(cls, qat_module: QAT.Conv2d):
-        r"""
-        return a :class:`~.QuantizedModule` instance converted from a
-        :class:`~.QATModule` instance.
-        """
-        output_dtype = qat_module.get_activation_dtype()
-        qconv = cls(
-            qat_module.in_channels,
-            qat_module.out_channels,
-            qat_module.kernel_size,
-            qat_module.stride,
-            qat_module.padding,
-            qat_module.dilation,
-            qat_module.groups,
-            dtype=output_dtype,
-        )
-        weight = qat_module.weight.astype(qat_module.get_weight_dtype())
-        qconv.weight = Parameter(weight.numpy())
-        if qat_module.bias is not None:
-            qconv.bias = Parameter(qat_module.bias.numpy())
-        else:
-            qconv.bias = Parameter(
-                np.zeros(qat_module._infer_bias_shape(), dtype=np.float32)
-            )
-        return qconv
-
-    def forward(self, inp):
-        return self.calc_conv_quantized(inp, nonlinear_mode="IDENTITY")
-
-
-class ConvRelu2d(Conv2d):
-    r"""Quantized version of :class:`~.qat.conv.ConvRelu2d`."""
-
-    def forward(self, inp):
-        return self.calc_conv_quantized(inp, nonlinear_mode="RELU")
diff --git a/python_module/megengine/module/quantized/conv_bn.py b/python_module/megengine/module/quantized/conv_bn.py
deleted file mode 100644
index ceb36d13..00000000
--- a/python_module/megengine/module/quantized/conv_bn.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
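# calc_conv_quantized above hinges on one identity: with qint8 input and weight, the
# int32 accumulator carries scale inp_scale * w_scale, so the float bias must be
# requantized to qint32 at exactly that scale before conv_bias_activation runs.
# A scalar NumPy sketch of the bookkeeping (values are illustrative):
import numpy as np

inp_scale, w_scale = 0.05, 0.01
bias_scale = inp_scale * w_scale              # scale of the int32 accumulator
bias_q32 = int(round(0.25 / bias_scale))      # what astype(qint32(bias_scale)) stores
assert np.isclose(bias_q32 * bias_scale, 0.25)  # round trip recovers the float bias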
-from ...core import Parameter -from ..qat import conv_bn as QAT -from .conv import Conv2d - - -class _ConvBnActivation2d(Conv2d): - r"""Applies a 2D convolution over an quantized input tensor, inference only. - - The parameter is same with :class: `~.Conv2d` - """ - - @classmethod - def from_qat_module(cls, qat_module: QAT._ConvBnActivation2d): - r""" - return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. - """ - output_dtype = qat_module.get_activation_dtype() - qconv = cls( - qat_module.conv.in_channels, - qat_module.conv.out_channels, - qat_module.conv.kernel_size, - qat_module.conv.stride, - qat_module.conv.padding, - qat_module.conv.dilation, - qat_module.conv.groups, - dtype=output_dtype, - ) - w_fold, b_fold = qat_module.fold_weight_bias( - qat_module.bn.running_mean, qat_module.bn.running_var - ) - weight = w_fold.astype(qat_module.get_weight_dtype()) - qconv.weight = Parameter(weight.numpy()) - qconv.bias = Parameter(b_fold.numpy()) - return qconv - - -class ConvBn2d(_ConvBnActivation2d): - r"""quantized version of :class:`~.qat.conv_bn.ConvBn2d`.""" - - def forward(self, inp): - return self.calc_conv_quantized(inp, nonlinear_mode="IDENTITY") - - -class ConvBnRelu2d(_ConvBnActivation2d): - r"""quantized version of :class:`~.qat.conv_bn.ConvBnRelu2d`.""" - - def forward(self, inp): - return self.calc_conv_quantized(inp, nonlinear_mode="RELU") diff --git a/python_module/megengine/module/quantized/elemwise.py b/python_module/megengine/module/quantized/elemwise.py deleted file mode 100644 index db04ed65..00000000 --- a/python_module/megengine/module/quantized/elemwise.py +++ /dev/null @@ -1,47 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from ... import _internal as mgb -from ...core import Tensor, wrap_io_tensor -from ...core.graph import _use_default_if_none -from ..qat import elemwise as QAT -from .module import QuantizedModule - - -@wrap_io_tensor -def _elemwise_multi_type(mode, *inputs, **kwargs) -> Tensor: - if all(isinstance(i, (int, float)) for i in inputs): - device, comp_graph = _use_default_if_none(None, None) - ret = mgb.opr.elemwise_multi_type( - *inputs, mode=mode, comp_node=device, comp_graph=comp_graph, **kwargs, - ) - return ret.inferred_value[0] - return mgb.opr.elemwise_multi_type(*inputs, mode=mode, **kwargs) - - -class Elemwise(QuantizedModule): - r"""quantized version of :class:`~.qat.elemwise.Elemwise`.""" - - _elemwise_multi_type_mode = mgb.opr_param_defs.ElemwiseMultiType.Mode - - def __init__(self, method, dtype=None): - super().__init__() - self.method = self._elemwise_multi_type_mode.convert("Q" + method) - self.output_dtype = dtype - - def forward(self, *inps): - if self.training: - raise ValueError("quantized module only support inference.") - return _elemwise_multi_type(self.method, *inps, dtype=self.output_dtype) - - @classmethod - def from_qat_module(cls, qat_module: QAT.Elemwise): - r""" - return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. 
- """ - return cls(qat_module.method.name, qat_module.get_activation_dtype()) diff --git a/python_module/megengine/module/quantized/linear.py b/python_module/megengine/module/quantized/linear.py deleted file mode 100644 index a6e61a6e..00000000 --- a/python_module/megengine/module/quantized/linear.py +++ /dev/null @@ -1,55 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - -import megengine._internal as mgb - -from ... import functional as F -from ...core import Parameter -from ..qat import linear as QAT -from .module import QuantizedModule - - -class Linear(QuantizedModule): - r"""quantized version of :class:`~.qat.linear.Linear`.""" - - def __init__( - self, dtype: np.dtype = None, - ): - super().__init__() - self.weight = None - self.bias = None - self.output_dtype = dtype - - def forward(self, inp): - if self.training: - raise ValueError("quantized module only support inference.") - inp_scale = mgb.dtype.get_scale(inp.dtype) - w_scale = mgb.dtype.get_scale(self.weight.dtype) - bias_dtype = mgb.dtype.qint32(inp_scale * w_scale) - ret = F.linear( - inp, - self.weight, - None if self.bias is None else self.bias.astype(bias_dtype), - ) - ret = ret if self.output_dtype is None else ret.astype(self.output_dtype) - return ret - - @classmethod - def from_qat_module(cls, qat_module: QAT.Linear): - r""" - return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. - """ - output_dtype = qat_module.get_activation_dtype() - qmod = cls(dtype=output_dtype) - weight = qat_module.weight.astype(qat_module.get_weight_dtype()) - qmod.weight = Parameter(weight.numpy()) - if qat_module.bias is not None: - qmod.bias = Parameter(qat_module.bias.numpy()) - return qmod diff --git a/python_module/megengine/module/quantized/module.py b/python_module/megengine/module/quantized/module.py deleted file mode 100644 index 4fccdbfa..00000000 --- a/python_module/megengine/module/quantized/module.py +++ /dev/null @@ -1,31 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from abc import abstractmethod - -from ..module import Module -from ..qat import QATModule - - -class QuantizedModule(Module): - r""" - Base class of quantized Module, which should be converted from QATModule - and not support traning. - """ - - def __call__(self, *inputs, **kwargs): - if self.training: - raise ValueError("quantized module only support inference.") - return super().__call__(*inputs, **kwargs) - - @classmethod - @abstractmethod - def from_qat_module(cls, qat_module: QATModule): - r""" - return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. 
- """ diff --git a/python_module/megengine/module/quantized/quant_dequant.py b/python_module/megengine/module/quantized/quant_dequant.py deleted file mode 100644 index 0c245011..00000000 --- a/python_module/megengine/module/quantized/quant_dequant.py +++ /dev/null @@ -1,49 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from ..qat import quant_dequant as QAT -from .module import QuantizedModule - - -class QuantStub(QuantizedModule): - r""" - quantized version of :class:`~.qat.quant_dequant.QuantStub`, - will convert input to quantized dtype. - """ - - def __init__(self, dtype=None): - super().__init__() - self.output_dtype = dtype - - def forward(self, inp): - return inp.astype(self.output_dtype) - - @classmethod - def from_qat_module(cls, qat_module: QAT.QuantStub): - r""" - return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. - """ - return cls(qat_module.get_activation_dtype()) - - -class DequantStub(QuantizedModule): - r""" - quantized version of :class:`~.qat.quant_dequant.DequantStub`, - will restore quantized input to float32 dtype. - """ - - def forward(self, inp): - return inp.astype("float32") - - @classmethod - def from_qat_module(cls, qat_module: QAT.DequantStub): - r""" - return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. - """ - return cls() diff --git a/python_module/megengine/module/sequential.py b/python_module/megengine/module/sequential.py deleted file mode 100644 index 01291e98..00000000 --- a/python_module/megengine/module/sequential.py +++ /dev/null @@ -1,97 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from collections import OrderedDict - -from .module import Module - - -class Sequential(Module): - r"""A sequential container. - Modules will be added to it in the order they are passed in the constructor. - Alternatively, an ordered dict of modules can also be passed in. - - To make it easier to understand, here is a small example: - - .. 
testcode:: - from collections import OrderedDict - import numpy as np - import megengine.functional as F - from megengine.module import Sequential, Linear - from megengine import tensor - - batch_size = 64 - data = tensor(np.zeros((batch_size, 1, 28, 28)), dtype=np.float32) - label = tensor(np.zeros(batch_size,), dtype=np.int32) - - data = data.reshape(batch_size, -1) - - net0 = Sequential( - Linear(28 * 28, 320), - Linear(320, 10) - ) - - pred0 = net0(data) - - modules = OrderedDict() - modules["fc0"] = Linear(28 * 28, 320) - modules["fc1"] = Linear(320, 10) - net1 = Sequential(modules) - - pred1 = net1(data) - """ - - def __init__(self, *args): - super().__init__() - self.layer_keys = [] - if len(args) == 1 and isinstance(args[0], OrderedDict): - for key, module in args[0].items(): - # self.add_module(key, module) - setattr(self, key, module) - self.layer_keys.append(key) - else: - for idx, module in enumerate(args): - # self.add_module(str(idx), module) - setattr(self, str(idx), module) - self.layer_keys.append(str(idx)) - - def __getitem__(self, idx): - if isinstance(idx, slice): - return self.__class__( - OrderedDict(zip(self.layer_keys[idx], self.layer_values[idx])) - ) - else: - return getattr(self, self.layer_keys[idx]) - - def __setitem__(self, idx, module): - key = self.layer_keys[idx] - return setattr(self, key, module) - - def __delitem__(self, idx): - if isinstance(idx, slice): - for key in self.layer_keys[idx]: - delattr(self, key) - del self.layer_keys[idx] - else: - delattr(self, self.layer_keys[idx]) - del self.layer_keys[idx] - - def __len__(self): - return len(self.layer_keys) - - def __iter__(self): - return iter(self.layer_values) - - @property - def layer_values(self): - return [getattr(self, key) for key in self.layer_keys] - - def forward(self, inp): - for layer in self.layer_values: - inp = layer(inp) - return inp diff --git a/python_module/megengine/optimizer/__init__.py b/python_module/megengine/optimizer/__init__.py deleted file mode 100644 index ad783e06..00000000 --- a/python_module/megengine/optimizer/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from .adadelta import Adadelta -from .adagrad import Adagrad -from .adam import Adam -from .lr_scheduler import LRScheduler -from .multi_step_lr import MultiStepLR -from .optimizer import Optimizer -from .sgd import SGD diff --git a/python_module/megengine/optimizer/adadelta.py b/python_module/megengine/optimizer/adadelta.py deleted file mode 100644 index 7d793608..00000000 --- a/python_module/megengine/optimizer/adadelta.py +++ /dev/null @@ -1,78 +0,0 @@ -from typing import Iterable, Union - -import numpy as np - -from ..core import Buffer, Parameter -from ..functional import sqrt -from .internal import add_update_fastpath as add_update -from .optimizer import Optimizer - - -class Adadelta(Optimizer): - r"""Implements Adadelta algorithm. - - It has been proposed in `"ADADELTA: An Adaptive Learning Rate Method" `_. - - :param params: iterable of parameters to optimize or dicts defining - parameter groups. - :param lr: coefficient that scale delta before it is applied - to the parameters (default: 1.0). 
- :param rho: coefficient used for computing a running average - of squared gradients (default: 0.9). - :param eps: term added to the denominator to improve - numerical stability (default: 1e-6). - :param weight_decay: weight decay (L2 penalty) (default: 0). - """ - - def __init__( - self, - params: Union[Iterable[Parameter], dict], - lr: float = 1.0, - rho: float = 0.9, - eps: float = 1e-6, - weight_decay: float = 0.0, - ): - assert lr >= 0.0, "Invalid learning rate: {}".format(lr) - assert rho >= 0.0 and rho <= 1.0, "Invalid rho value: {}".format(rho) - assert eps >= 0.0, "Invalid epsilon value: {}".format(eps) - assert weight_decay >= 0.0, "Invalid weight_decay value: {}".format( - weight_decay - ) - - defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay) - super().__init__(params, defaults) - - def _create_state(self, param_group): - for param in param_group["params"]: - self._add_state(param, "square_avg") - self._add_state(param, "acc_delta") - self._add_state(param, "step", initializer=0.0) - - def _updates(self, param_group): - lr = param_group["lr"] - weight_decay = param_group["weight_decay"] - rho = param_group["rho"] - eps = param_group["eps"] - - for param in param_group["params"]: - if not isinstance(param.grad, Buffer): - raise TypeError( - "grad must be a Buffer, maybe you forget to call backward()?" - ) - - if not param.requires_grad: - continue - - step = self._state[param]["step"] - step = add_update(step, 1) - grad = param.grad - if weight_decay != 0.0: - grad = add_update(grad, param, beta=weight_decay) - - square_avg = self._state[param]["square_avg"] - acc_delta = self._state[param]["acc_delta"] - square_avg = add_update(square_avg, grad ** 2, alpha=rho, beta=1 - rho) - std = sqrt(square_avg + eps) - delta = sqrt(acc_delta + eps) / std * grad - add_update(param, delta, beta=-lr) - acc_delta = add_update(acc_delta, delta ** 2, alpha=rho, beta=1 - rho) diff --git a/python_module/megengine/optimizer/adagrad.py b/python_module/megengine/optimizer/adagrad.py deleted file mode 100644 index 4683fa10..00000000 --- a/python_module/megengine/optimizer/adagrad.py +++ /dev/null @@ -1,75 +0,0 @@ -from typing import Iterable, Union - -import numpy as np - -from ..core import Buffer, Parameter -from ..functional import sqrt -from .internal import add_update_fastpath as add_update -from .optimizer import Optimizer - - -class Adagrad(Optimizer): - r"""Implements Adagrad algorithm. - - It has been proposed in `"Adaptive Subgradient Methods for Online Learning - and Stochastic Optimization" `_. - - :param params: iterable of parameters to optimize or dicts defining - parameter groups. - :param lr: coefficient that scale delta before it is applied - to the parameters (default: 1e-2). - :param lr_decay: learning rate decay (default: 0) - :param eps: term added to the denominator to improve - numerical stability (default: 1e-10). - :param weight_decay: weight decay (L2 penalty) (default: 0). 
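# The Adadelta update implemented above, restated as a NumPy step for a single
# parameter (state arrays and hyper-parameters mirror the defaults; illustrative only):
import numpy as np

def adadelta_step(param, grad, square_avg, acc_delta, lr=1.0, rho=0.9, eps=1e-6):
    square_avg = rho * square_avg + (1 - rho) * grad ** 2
    delta = np.sqrt(acc_delta + eps) / np.sqrt(square_avg + eps) * grad
    param = param - lr * delta
    acc_delta = rho * acc_delta + (1 - rho) * delta ** 2
    return param, square_avg, acc_delta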
- """ - - def __init__( - self, - params: Union[Iterable[Parameter], dict], - lr: float = 1e-2, - lr_decay: float = 0.0, - eps: float = 1e-10, - weight_decay: float = 0.0, - ): - assert lr >= 0.0, "Invalid learning rate: {}".format(lr) - assert lr_decay >= 0, "Invalid learning rate decay: {}".format(lr_decay) - assert eps >= 0.0, "Invalid epsilon value: {}".format(eps) - assert weight_decay >= 0.0, "Invalid weight_decay value: {}".format( - weight_decay - ) - - defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay) - super().__init__(params, defaults) - - def _create_state(self, param_group): - for param in param_group["params"]: - self._add_state(param, "square_avg") - self._add_state(param, "step", initializer=0.0) - - def _updates(self, param_group): - lr = param_group["lr"] - lr_decay = param_group["lr_decay"] - weight_decay = param_group["weight_decay"] - eps = param_group["eps"] - - for param in param_group["params"]: - if not isinstance(param.grad, Buffer): - raise TypeError( - "grad must be a Buffer, maybe you forget to call backward()?" - ) - - if not param.requires_grad: - continue - - step = self._state[param]["step"] - step = add_update(step, 1) - grad = param.grad - if weight_decay != 0.0: - grad = add_update(grad, param, beta=weight_decay) - - square_avg = self._state[param]["square_avg"] - square_avg = add_update(square_avg, grad ** 2) - delta = grad / sqrt(square_avg + eps) - clr = lr / (1 + (step - 1) * lr_decay) - add_update(param, delta, beta=-clr) diff --git a/python_module/megengine/optimizer/adam.py b/python_module/megengine/optimizer/adam.py deleted file mode 100644 index 6f264d3b..00000000 --- a/python_module/megengine/optimizer/adam.py +++ /dev/null @@ -1,86 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Iterable, Tuple, Union - -from ..core import Buffer, Parameter -from .internal import add_update_fastpath as add_update -from .optimizer import Optimizer - - -class Adam(Optimizer): - r"""Implements Adam algorithm proposed in `"Adam: A Method for Stochastic Optimization" `_. - - :param params: iterable of parameters to optimize or dicts defining - parameter groups. - :param lr: learning rate. - :param betas: coefficients used for computing running averages of gradient - and its square. Default: (0.9, 0.999) - :param eps: term added to the denominator to improve numerical stability - Default: 1e-8 - :param weight_decay: weight decay (L2 penalty). 
Default: 0 - """ - - def __init__( - self, - params: Union[Iterable[Parameter], dict], - lr: float, - betas: Tuple[float, float] = (0.9, 0.999), - eps: float = 1e-8, - weight_decay: float = 0.0, - ): - if lr < 0.0: - raise ValueError("Invalid learning rate: {}".format(lr)) - if weight_decay < 0.0: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - - defaults = dict(lr=lr, weight_decay=weight_decay, betas=betas, eps=eps) - super().__init__(params, defaults) - - def _create_state(self, param_group): - for param in param_group["params"]: - self._add_state(param, "exp_avg") - self._add_state(param, "exp_avg_sq") - self._add_state(param, "step", initializer=0.0) - - def _updates(self, param_group): - lr = param_group["lr"] - weight_decay = param_group["weight_decay"] - eps = param_group["eps"] - beta0, beta1 = param_group["betas"] - - for param in param_group["params"]: - if not param.requires_grad: - continue - - step = self._state[param]["step"] - step = add_update(step, 1) - if not isinstance(param.grad, Buffer): - raise TypeError( - "grad must be a Buffer, maybe you forget to call backward()?" - ) - grad = param.grad - if weight_decay != 0.0: - grad = add_update(grad, param, beta=weight_decay) - exp_avg = self._state[param]["exp_avg"] - exp_avg_sq = self._state[param]["exp_avg_sq"] - exp_avg = add_update(exp_avg, grad, alpha=beta0, beta=1 - beta0) - exp_avg_sq = add_update( - exp_avg_sq, grad * grad, alpha=beta1, beta=1 - beta1 - ) - add_update( - param, - exp_avg - / (1 - beta0 ** step) - / (exp_avg_sq.sqrt() / (1 - beta1 ** step).sqrt() + eps), - beta=-lr, - ) diff --git a/python_module/megengine/optimizer/internal.py b/python_module/megengine/optimizer/internal.py deleted file mode 100644 index 0483af9a..00000000 --- a/python_module/megengine/optimizer/internal.py +++ /dev/null @@ -1,51 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Union - -import megengine._internal as mgb - -from ..core.tensor import Tensor, tensor - - -def add_update_fastpath( - dest: Tensor, - delta: Tensor, - *, - alpha: Union[Tensor, float, int] = 1.0, - beta: Union[Tensor, float, int] = 1.0, - bias: Union[Tensor, float, int] = 0.0 -): - """a fast-path ONLY used to update parameters in optimizer, since it - would bypass computing graph and launch dnn/add_update kernel directly, - it is more efficient than functional/add_update. 
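# The Adam step above, with both bias corrections written out explicitly; beta0/beta1
# follow the naming used in the code (NumPy sketch, illustrative only):
import numpy as np

def adam_step(param, grad, m, v, step, lr, beta0=0.9, beta1=0.999, eps=1e-8):
    step += 1
    m = beta0 * m + (1 - beta0) * grad         # exp_avg
    v = beta1 * v + (1 - beta1) * grad * grad  # exp_avg_sq
    m_hat = m / (1 - beta0 ** step)            # bias-corrected first moment
    v_hat = v / (1 - beta1 ** step)            # bias-corrected second moment
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v, step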
- """ - - if isinstance(beta, Tensor) or isinstance(alpha, Tensor): - delta *= beta - beta = 1.0 - if isinstance(alpha, Tensor): - delta += (alpha - 1.0) * dest - alpha = 1.0 - if isinstance(bias, Tensor): - delta += bias - bias = 0.0 - - if not isinstance(delta, Tensor): - delta = tensor(delta, device=dest.device, dtype=dest.dtype) - - def get_v(x): - if x._Tensor__val is None: - assert isinstance(x._Tensor__sym, mgb.SymbolVar) - return x._Tensor__sym.eager_val - else: - assert isinstance(x._Tensor__val, mgb.SharedND) - return x._Tensor__val - - mgb.mgb._add_update_fastpath(get_v(dest), get_v(delta), alpha, beta, bias) - return dest diff --git a/python_module/megengine/optimizer/lr_scheduler.py b/python_module/megengine/optimizer/lr_scheduler.py deleted file mode 100644 index 7cdb6d9b..00000000 --- a/python_module/megengine/optimizer/lr_scheduler.py +++ /dev/null @@ -1,65 +0,0 @@ -from abc import ABCMeta - -from .optimizer import Optimizer - - -class LRScheduler(metaclass=ABCMeta): - r"""Base class for all learning rate based schedulers. - - :param optimizer: Wrapped optimizer. - :param current_epoch: The index of current epoch. Default: -1 - """ - - def __init__( # pylint: disable=too-many-branches - self, optimizer: Optimizer, current_epoch: int = -1 - ): - if not isinstance(optimizer, Optimizer): - raise TypeError( - "optimizer argument given to the lr_scheduler should be Optimizer" - ) - self.optimizer = optimizer - self.current_epoch = current_epoch - if current_epoch == -1: - for group in self.optimizer.param_groups: - group.setdefault("initial_lr", group["lr"]) - else: - for i, group in enumerate(optimizer.param_groups): - if "initial_lr" not in group: - raise KeyError( - "param 'initial_lr' is not specified in " - "param_groups[{}] when resuming an optimizer".format(i) - ) - self.base_lrs = list( - map(lambda group: group["initial_lr"], self.optimizer.param_groups) - ) - - self.step() - - def state_dict(self): - r"""Returns the state of the scheduler as a :class:`dict`. - It contains an entry for every variable in self.__dict__ which - is not the optimizer. - """ - raise NotImplementedError - - def load_state_dict(self, state_dict): - r"""Loads the schedulers state. - - :param state_dict (dict): scheduler state. - """ - raise NotImplementedError - - def get_lr(self): - r""" Compute current learning rate for the scheduler. - """ - raise NotImplementedError - - def step(self, epoch=None): - if epoch is None: - self.current_epoch += 1 - else: - self.current_epoch = epoch - - values = self.get_lr() - for param_group, lr in zip(self.optimizer.param_groups, values): - param_group["lr"] = lr diff --git a/python_module/megengine/optimizer/multi_step_lr.py b/python_module/megengine/optimizer/multi_step_lr.py deleted file mode 100644 index e6ac5ca0..00000000 --- a/python_module/megengine/optimizer/multi_step_lr.py +++ /dev/null @@ -1,67 +0,0 @@ -from bisect import bisect_right -from typing import Iterable as Iter - -from .lr_scheduler import LRScheduler -from .optimizer import Optimizer - - -class MultiStepLR(LRScheduler): - r"""Decays the learning rate of each parameter group by gamma once the - number of epoch reaches one of the milestones. - - :param optimizer: Wrapped optimizer. - :param milestones (list): List of epoch indices. Must be increasing. - :param gamma (float): Multiplicative factor of learning rate decay. Default: 0.1. - :param current_epoch: The index of current epoch. Default: -1. 
- """ - - def __init__( - self, - optimizer: Optimizer, - milestones: Iter[int], - gamma: float = 0.1, - current_epoch: int = -1, - ): - if not list(milestones) == sorted(milestones): - raise ValueError( - "Milestones should be a list of increasing integers. Got {}".format( - milestones - ) - ) - - self.milestones = milestones - self.gamma = gamma - super().__init__(optimizer, current_epoch) - - def state_dict(self): - r"""Returns the state of the scheduler as a :class:`dict`. - It contains an entry for every variable in self.__dict__ which - is not the optimizer. - """ - return { - key: value - for key, value in self.__dict__.items() - if key in ["milestones", "gamma", "current_epoch"] - } - - def load_state_dict(self, state_dict): - r"""Loads the schedulers state. - - :param state_dict (dict): scheduler state. - """ - tmp_dict = {} - for key in ["milestones", "gamma", "current_epoch"]: - if not key in state_dict.keys(): - raise KeyError( - "key '{}'' is not specified in " - "state_dict when loading state dict".format(key) - ) - tmp_dict[key] = state_dict[key] - - self.__dict__.update(tmp_dict) - - def get_lr(self): - return [ - base_lr * self.gamma ** bisect_right(self.milestones, self.current_epoch) - for base_lr in self.base_lrs - ] diff --git a/python_module/megengine/optimizer/optimizer.py b/python_module/megengine/optimizer/optimizer.py deleted file mode 100644 index 2596d26a..00000000 --- a/python_module/megengine/optimizer/optimizer.py +++ /dev/null @@ -1,307 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from abc import ABCMeta, abstractmethod -from collections import Iterable -from typing import Dict -from typing import Iterable as Iter -from typing import Union - -import numpy as np - -from .._internal.config import opr_priority_scope -from ..core import Buffer, Parameter, Tensor, TensorDict -from ..core.graph import get_default_graph -from ..distributed import ( - all_reduce_sum, - bcast_param, - get_rank, - get_world_size, - is_distributed, -) -from ..distributed.util import get_group_id -from ..functional import add_update -from ..functional import grad as grad_func -from ..jit import sideeffect - - -class _RequiredParameter: - def __repr__(self): - return "" - - -required = _RequiredParameter() - - -class Optimizer(metaclass=ABCMeta): - r"""Base class for all optimizers. - - :param params: specifies what Tensors should be optimized. - :param defaults: a dict of default parameters of Optimizer, like learning rate or momentum. - :param bcast_period: interval time between two broadcast of distributed training. 
Default: 500
-    """
-
-    def __init__(  # pylint: disable=too-many-branches
-        self,
-        params: Union[Iter[Parameter], dict],
-        defaults: dict,
-        bcast_period: int = 500,
-    ):
-        self._state = TensorDict()
-        self._defaults = defaults
-        self._bcast_iter = 0
-        self._bcast_period = bcast_period
-
-        if isinstance(params, (Parameter, dict)):
-            params = [params]
-        else:
-            if not isinstance(params, Iterable):
-                raise TypeError(
-                    "params argument given to the optimizer should be "
-                    "Parameter or dict, or Iterable of them"
-                )
-
-        self.param_groups = []  # type: list
-
-        param_groups = list(params)
-        if len(param_groups) == 0:
-            raise ValueError("optimizer got an empty parameter list")
-
-        param_type = type(param_groups[0])
-        for param in param_groups:
-            if not isinstance(param, param_type):
-                raise TypeError(
-                    "types of params argument given to the optimizer should be the same"
-                )
-
-        if not isinstance(param_groups[0], dict):
-            param_groups = [{"params": param_groups}]
-
-        for group in param_groups:
-            self.add_param_group(group)
-
-        for group in self.param_groups:
-            self._create_state(group)
-
-        if is_distributed() and bcast_period != -1:
-            self.bcast_param()
-
-    def add_param_group(self, param_group: dict):
-        r"""Add a param group to ``param_groups`` of the :class:`~megengine.optim.optimizer.Optimizer`.
-
-        This can be useful when fine-tuning a pre-trained network, as frozen layers can be made
-        trainable and added to the :class:`~megengine.optim.optimizer.Optimizer` as training progresses.
-
-        :param param_group: specifies what tensors should be optimized along with group.
-
-        """
-        assert isinstance(param_group, dict), "param group must be a dict"
-
-        if isinstance(param_group["params"], Parameter):
-            param_group["params"] = [param_group["params"]]
-        else:
-            param_group["params"] = list(param_group["params"])
-
-        for param in param_group["params"]:
-            if not isinstance(param, Parameter):
-                raise TypeError(
-                    "optimizer can only optimize Parameters, but one of the params is "
-                    + str(type(param))
-                )
-            if not param.requires_grad:
-                raise ValueError(
-                    "optimizer can only optimize Parameters with requires_grad=True"
-                )
-
-        for name, default in self._defaults.items():
-            if default is required and name not in param_group:
-                raise ValueError(
-                    "parameter group didn't specify a value of "
-                    "required optimization parameter " + name
-                )
-            param_group.setdefault(name, default)
-
-        param_set = set()
-
-        for group in self.param_groups:
-            param_set.update(set(map(id, group["params"])))
-
-        assert param_set.isdisjoint(
-            set(map(id, param_group["params"]))
-        ), "some parameters appear in more than one parameter group"
-
-        self.param_groups.append(param_group)
-
-    def _add_state(self, param, state_name, initializer=None):
-        if initializer is None:
-            initializer = np.zeros(param.shape, dtype=np.float32)
-        state_dict = self._state.setdefault(param, {})
-        assert state_name not in state_dict
-        state = Buffer(value=initializer)
-        state_dict[state_name] = state
-
-    @abstractmethod
-    def _create_state(self, param_group):
-        pass
-
-    @abstractmethod
-    def _updates(self, param_group):
-        pass
-
-    def backward(self, loss: Tensor):
-        """Computes the back-propagation of the network given loss.
- - :param loss: The obtained loss tensor - """ - rst = [] - params = [] - for group in self.param_groups: - for param in group["params"]: - if param.grad is None: - param.grad = Buffer( - value=np.zeros(shape=param.shape, dtype=np.float32) - ) - - params.append(param) - assert hasattr(param, "grad"), "param has no grad" - assert isinstance(param.grad, Buffer), "grad must be a buffer" - - cg = get_default_graph() - grads = grad_func(loss, params, use_virtual_grad=not cg.is_eager()) - if not isinstance(grads, list): - grads = [grads] - assert len(grads) == len(params) - - for param, grad in zip(params, grads): - if is_distributed() and param.replica_mode: - with opr_priority_scope(cg, -(2 ** 30)): - # always run all_reduce_mean first except add_update - grad = ( - all_reduce_sum( - grad, "grad_" + str(get_group_id()), get_world_size() - ) - / get_world_size() - ) - with opr_priority_scope(cg, -(2 ** 31)): - # always run add_update first - grad_update = add_update(param.grad, grad) - else: - grad_update = add_update(param.grad, grad) - rst.append(grad_update) - - return rst - - @sideeffect - def step(self): - r"""Performs a single optimization step. - - """ - for group in self.param_groups: - if isinstance(group["params"], set): - raise TypeError( - "optimized parameters need to be organized in ordered collections, " - "but the ordering of parameters in sets will change between runs. " - "Please use a list instead." - ) - self._updates(group) - - if is_distributed() and self._bcast_period != -1: - self._bcast_iter += 1 - if self._bcast_iter == self._bcast_period: - self.bcast_param() - self._bcast_iter = 0 - - @sideeffect - def zero_grad(self): - r"""Reset the grad to zeros. - - """ - for param_group in self.param_groups: - for param in param_group["params"]: - if param.grad is not None: - param.grad.reset_zero() - - def bcast_param(self): - key = 0 - for group in self.param_groups: - for param in group["params"]: - if param.replica_mode: - bcast_param( - param, - "bcast_param_" + str(key), - get_world_size(), - get_rank() == 0, - ) - key += 1 - - def state_dict(self) -> Dict: - r"""Export the optimizer state. - - :return: optimizer state. Can be loaded by :meth:`load_state_dict`. - """ - param_groups = [] - state = dict() - param2id = TensorDict() - - cur_id = 0 - for group in self.param_groups: - for param in group["params"]: - if param not in param2id: - param2id[param] = cur_id - cur_id += 1 - - for param, st in self._state.items(): - state[param2id[param]] = st - - for group in self.param_groups: - param_group = {k: v for k, v in group.items() if k != "params"} - param_group["params"] = [param2id[param] for param in group["params"]] - param_groups.append(param_group) - - return {"param_groups": param_groups, "state": state} - - def load_state_dict(self, state: dict): - r"""Loads the optimizer state. - - :param state: optimizer state. Should be an object returned - from a call to :meth:`state_dict`. 
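# backward/step/zero_grad above compose into the legacy training loop: unlike the
# torch convention, gradients are computed through the optimizer itself. A hedged
# usage sketch (model, loss_fn and dataloader are placeholders):
#
#     opt = SGD(model.parameters(), lr=0.01, momentum=0.9)
#     for inp, label in dataloader:
#         opt.zero_grad()
#         loss = loss_fn(model(inp), label)
#         opt.backward(loss)   # populates the param.grad buffers
#         opt.step()           # applies the per-group _updates rule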
- """ - if len(self.param_groups) != len(state["param_groups"]): - raise ValueError( - "loaded state dict has a different number of parameter groups" - ) - parameter_map = dict() # type: Dict - for group_new, group_saved in zip(self.param_groups, state["param_groups"]): - if len(group_new["params"]) != len(group_saved["params"]): - raise ValueError( - "loaded state dict contains a parameter group that " - "doesn't match the size of optimizer's group" - ) - for param_new, param_saved in zip( - group_new["params"], group_saved["params"] - ): - p = param_new - self._state[p] = state["state"][param_saved].copy() - for k, v in self._state[p].items(): - if isinstance(v, Buffer) and v._comp_graph != p._comp_graph: - self._state[p][k] = Buffer(v.numpy()) - - if set(group_new.keys()) != set(group_saved.keys()): - raise ValueError( - "loaded state dict contains a parameter group that " - "doesn't match the keys of optimizer's group" - ) - for key in group_new.keys(): - if key != "params": - group_new[key] = group_saved[key] - - if len(self._state.keys()) != len(state["state"].keys()): - raise ValueError( - "loaded state dict contains a state that doesn't match " - "the size of optimizer's state" - ) diff --git a/python_module/megengine/optimizer/sgd.py b/python_module/megengine/optimizer/sgd.py deleted file mode 100644 index a1f807b3..00000000 --- a/python_module/megengine/optimizer/sgd.py +++ /dev/null @@ -1,73 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Iterable, Union - -from ..core import Buffer, Parameter -from .internal import add_update_fastpath as add_update -from .optimizer import Optimizer - - -class SGD(Optimizer): - r"""Implements stochastic gradient descent. - - Nesterov momentum is based on the formula from - `"On the importance of initialization and momentum in deep learning" `_ . - - :param params: iterable of parameters to optimize or dicts defining - parameter groups. - :param lr: learning rate. - :param momentum: momentum factor. Default: 0.0 - :param weight_decay: weight decay (L2 penalty). Default: 0.0 - """ - - def __init__( - self, - params: Union[Iterable[Parameter], dict], - lr: float, - momentum: float = 0.0, - weight_decay: float = 0.0, - ): - assert lr >= 0.0, "Invalid learning rate: {}".format(lr) - assert momentum >= 0.0, "Invalid momentum value: {}".format(momentum) - assert weight_decay >= 0.0, "Invalid weight_decay value: {}".format( - weight_decay - ) - - defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay) - super().__init__(params, defaults) - - def _create_state(self, param_group): - if param_group["momentum"] != 0.0: - for param in param_group["params"]: - self._add_state(param, "momentum_buffer") - - def _updates(self, param_group): - lr = param_group["lr"] - weight_decay = param_group["weight_decay"] - momentum = param_group["momentum"] - - for param in param_group["params"]: - if not isinstance(param.grad, Buffer): - raise TypeError( - "grad must be a Buffer, maybe you forget to call backward()?" 
- ) - - if not param.requires_grad: - continue - - grad = param.grad - if weight_decay != 0.0: - grad = add_update(grad, param, beta=weight_decay) - - if momentum: - v = self._state[param]["momentum_buffer"] - update_v = add_update(v, grad, alpha=momentum) - add_update(param, update_v, beta=-lr) - else: - add_update(param, grad, beta=-lr) diff --git a/python_module/megengine/quantization/__init__.py b/python_module/megengine/quantization/__init__.py deleted file mode 100644 index 9c8a0e0d..00000000 --- a/python_module/megengine/quantization/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -from .fake_quant import FakeQuantize -from .internal_fake_quant import * -from .observer import HistogramObserver, Observer -from .qconfig import ( - QConfig, - calibration_qconfig, - ema_fakequant_qconfig, - ema_lowbit_fakequant_qconfig, - min_max_fakequant_qconfig, - tqt_quant_qconfig, -) -from .utils import QuantMode diff --git a/python_module/megengine/quantization/fake_quant.py b/python_module/megengine/quantization/fake_quant.py deleted file mode 100644 index 7260b9db..00000000 --- a/python_module/megengine/quantization/fake_quant.py +++ /dev/null @@ -1,145 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import copy -import math - -import numpy as np - -from .. import functional as F -from .._internal.dtype import _metadata_dict, get_quantized_dtype -from ..core import Buffer, Function, Parameter -from ..jit import sideeffect -from ..module import Module -from .utils import QuantMode, Round, fake_quant_tensor, get_qparam_dict - - -class _FakeQuantize(Module): - r""" - A Basic Fake Quant module. - - :param dtype: A string indicating the target quantization type of input. - :param narrow_range: Whether the absolute value of ``qmin`` is the same as ``qmax``, - instead of 1 greater. Usually True for weight and False for activation. - :param enable: Whether do ``normal_forward`` or ``fake_quant_forward``. 
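# The momentum branch of SGD._updates above keeps v <- momentum * v + grad and then
# applies param <- param - lr * v; weight decay folds into the gradient first.
# NumPy sketch (illustrative only):
import numpy as np

def sgd_step(param, grad, v, lr, momentum=0.9, weight_decay=0.0):
    if weight_decay != 0.0:
        grad = grad + weight_decay * param
    v = momentum * v + grad
    return param - lr * v, v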
- """ - - def __init__(self, dtype: str, narrow_range: bool = False, enable: bool = True): - super().__init__() - if not dtype in _metadata_dict.keys(): - raise ValueError( - "unknown dtype: {}, only support {}".format( - dtype, _metadata_dict.keys() - ) - ) - self.dtype = dtype - self.narrow_range = narrow_range - self.qmin = ( - -_metadata_dict[dtype].qmax if narrow_range else _metadata_dict[dtype].qmin - ) - self.qmax = _metadata_dict[dtype].qmax - self.enabled = enable - - def enable(self): - self.enabled = True - - def disable(self): - self.enabled = False - - def fake_quant_forward(self, inp, q_dict=None): - return inp - - def normal_foward(self, inp, q_dict=None): - return inp - - def forward(self, inp, q_dict=None): - if self.enabled: - return self.fake_quant_forward(inp, q_dict=q_dict) - else: - return self.normal_foward(inp, q_dict=q_dict) - - -class TQT_Function(Function): - def __init__(self, lowerbound, upperbound): - super().__init__() - self.lowerbound = lowerbound - self.upperbound = upperbound - - def forward(self, inp, scale): - t = 2 ** scale - # t = F.maximum(t, 1e-4) - inp_scaled = inp / t - inp_clipped = F.maximum(F.minimum(inp_scaled, self.upperbound), self.lowerbound) - inp_rounded = F.round(inp_clipped) - inp_flq = inp_rounded * t - self.save_for_backward(inp_scaled, inp_rounded, t) - return inp_flq - - def backward(self, grad_inp_flq): - (inp_scaled, inp_rounded, t) = self.saved_tensors - mask_clip = (inp_scaled < -0.5 + self.lowerbound) + ( - inp_scaled > self.upperbound + 0.5 - ) # mask for accumulating the gradients of |data_scaled|>L - mask_quant = F.abs( - mask_clip - 1 - ) # mask for accumulating the gradients with |data_scaled|<=L - grad_quant = ( - grad_inp_flq * mask_quant * (inp_rounded - inp_scaled) - ) # gradient within |data_scaled|<=L - grad_clip = ( - grad_inp_flq * mask_clip * inp_rounded - ) # gradient with | data_scaled|>L - grad_s = grad_clip.sum() + grad_quant.sum() - # dL/ds = dL/dt * t * ln(2) - grad_s = grad_s * t * math.log(2) - grad_inp = grad_inp_flq * mask_quant - return grad_inp, grad_s - - -class TQT(_FakeQuantize): - r""" - TQT: https://arxiv.org/abs/1903.08066 Trained Quantization Thresholds - for Accurate and Efficient Fixed-Point Inference of Deep Neural Networks. - """ - - def __init__(self, dtype: str, narrow_range: bool = False, enable: bool = True): - super().__init__(dtype, narrow_range, enable) - self.scale = Parameter(0.0, dtype=np.float32) - - def fake_quant_forward(self, inp, q_dict=None): - # when enable, TQT will do fakequant forward, finetune the scale - return TQT_Function(self.qmin, self.qmax)(inp, self.scale) - - def normal_foward(self, inp, q_dict=None): - if q_dict["enable_observer"]: - # when disable, TQT will do normal forward, initialize scale weight - tmp_scale = F.maximum(F.abs(q_dict["min_val"]), F.abs(q_dict["max_val"])) - tmp_scale = F.log(tmp_scale / 127) / F.log(2) - F.add_update(self.scale, tmp_scale, alpha=0.0, beta=1.0, bias=0.0) - return inp - - def get_qparams(self): - q_dict = get_qparam_dict(QuantMode.TQT) - q_dict["scale"] = 2 ** self.scale - return q_dict - - def get_dtype(self): - q_dict = self.get_qparams() - scale = None if "scale" not in q_dict else q_dict["scale"].numpy()[0] - zero_point = ( - None if "zero_point" not in q_dict else q_dict["zero_point"].numpy()[0] - ) - return get_quantized_dtype(self.dtype, scale, zero_point) - - -class FakeQuantize(_FakeQuantize): - r""" - A module to do quant and dequant according to observer's scale and zero_point. 
- """ - - def fake_quant_forward(self, inp, q_dict=None): - return fake_quant_tensor(inp, self.qmin, self.qmax, q_dict) diff --git a/python_module/megengine/quantization/internal_fake_quant.py b/python_module/megengine/quantization/internal_fake_quant.py deleted file mode 100644 index df15a916..00000000 --- a/python_module/megengine/quantization/internal_fake_quant.py +++ /dev/null @@ -1,19 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import copy -import math -from functools import partial - -import numpy as np - -from .. import functional as F -from ..core import Function -from .fake_quant import _FakeQuantize -from .observer import MinMaxObserver -from .qconfig import QConfig - diff --git a/python_module/megengine/quantization/observer.py b/python_module/megengine/quantization/observer.py deleted file mode 100644 index 6aa3a406..00000000 --- a/python_module/megengine/quantization/observer.py +++ /dev/null @@ -1,422 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import math -from abc import abstractmethod -from enum import Enum - -import numpy as np - -from .. import functional as F -from .._internal.dtype import _metadata_dict, get_quantized_dtype -from ..core import Buffer -from ..jit import sideeffect -from ..module import Module -from .utils import QuantMode, Round, get_qparam_dict - - -class Observer(Module): - r""" - A base class for Observer Module. - - :param dtype: a string indicating to collect scale and zero_point of which dtype - :param narrow_range: Whether the absolute value of ``qmin`` is the same as ``qmax``, - instead of 1 greater. Usually True for weight and False for activation. 
- """ - - def __init__(self, dtype: str, narrow_range: bool = False): - super().__init__() - if dtype not in _metadata_dict.keys(): - raise ValueError( - "unknown dtype: {}, only support {}".format( - dtype, _metadata_dict.keys() - ) - ) - self.dtype = dtype - self.narrow_range = narrow_range - self.qmin = ( - -_metadata_dict[dtype].qmax if narrow_range else _metadata_dict[dtype].qmin - ) - self.qmax = _metadata_dict[dtype].qmax - self.enabled = True - - def get_dtype(self): - q_dict = self.get_qparams() - numpy_scale = None if "scale" not in q_dict else q_dict["scale"].numpy()[0] - numpy_zero_point = ( - None if "zero_point" not in q_dict else q_dict["zero_point"].numpy()[0] - ) - return get_quantized_dtype(self.dtype, numpy_scale, numpy_zero_point) - - def enable(self): - self.enabled = True - - def disable(self): - self.enabled = False - - def train(self, mode: bool = True, recursive: bool = True) -> None: - super().train(mode, recursive) - if mode: - self.enable() - else: - self.disable() - - @abstractmethod - def forward(self, x): - pass - - @abstractmethod - def get_qparams(self, **kwargs): - pass - - -class MinMaxObserver(Observer): - def __init__( - self, - mode=QuantMode.SYMMERTIC, - eps=0.00001, - dtype="qint8", - narrow_range: bool = False, - ): - super().__init__(dtype, narrow_range) - self.mode = mode - self.min_val = Buffer(np.finfo(np.float32).max, dtype=np.float32) - self.max_val = Buffer(np.finfo(np.float32).min, dtype=np.float32) - self.scale_limit = eps - - def _calculate_qparams(self, inp_min_val, inp_max_val): - min_val = F.minimum(0.0, inp_min_val) - max_val = F.maximum(0.0, inp_max_val) - q_dict = get_qparam_dict(self.mode) - q_dict["min_val"] = inp_min_val - q_dict["max_val"] = inp_max_val - q_dict["enable_observer"] = self.enable - if self.mode == QuantMode.SYMMERTIC: - symmetric_max_vals = F.maximum(-min_val, max_val) - # use maximun to avoid scale too small at the begin - q_dict["scale"] = F.maximum( - symmetric_max_vals / ((self.qmax - self.qmin) / 2), self.scale_limit - ) - # zero_point = self.zero_point - else: - # use maximun to avoid scale too small at the begin - q_dict["scale"] = F.maximum( - (max_val - min_val) / (self.qmax - self.qmin), self.scale_limit, - ) - # caculate zero_point - q_dict["zero_point"] = self.qmin - Round()((min_val / q_dict["scale"])) - - return q_dict - - def get_qparams(self): - return self._calculate_qparams(self.min_val, self.max_val) - - def forward(self, x_orig): - if self.enabled: - # stop gradient - x = F.zero_grad(x_orig) - # find max and min - F.add_update( - self.min_val, - F.minimum(self.min_val, x.min()), - alpha=0.0, - beta=1.0, - bias=0.0, - ) - F.add_update( - self.max_val, - F.maximum(self.max_val, x.max()), - alpha=0.0, - beta=1.0, - bias=0.0, - ) - return x_orig - - -class ExponentialMovingAverageObserver(MinMaxObserver): - def __init__( - self, - momentum=0.9, - mode=QuantMode.SYMMERTIC, - eps=0.00001, - dtype="qint8", - narrow_range: bool = False, - ): - super().__init__(mode, eps, dtype, narrow_range) - self.momentum = Buffer(momentum) - self.runtime_momentum = Buffer(0.0) - - def set_momentum(self, momentum): - self.momentum.set_value(momentum) - - def forward(self, x_orig): - if self.enabled: - # stop gradient - x = F.zero_grad(x_orig) - # Exponential Moving Average - tmp_min = ( - self.min_val * self.runtime_momentum - + (1 - self.runtime_momentum) * x.min() - ) - tmp_max = ( - self.max_val * self.runtime_momentum - + (1 - self.runtime_momentum) * x.max() - ) - F.add_update(self.min_val, tmp_min, alpha=0.0, 
beta=1.0, bias=0.0) - F.add_update(self.max_val, tmp_max, alpha=0.0, beta=1.0, bias=0.0) - F.add_update( - self.runtime_momentum, self.momentum, alpha=0.0, beta=1.0, bias=0.0 - ) - return x_orig - - -class HistogramObserver(MinMaxObserver): - def __init__( - self, - bins=2048, - upsample_rate=128, - mode=QuantMode.SYMMERTIC, - eps=0.00001, - dtype="qint8", - narrow_range: bool = False, - ): - super().__init__(mode, eps, dtype, narrow_range) - self.bins = bins - self.upsample_rate = upsample_rate - self.dst_nbins = _metadata_dict[dtype].qmax - _metadata_dict[dtype].qmin + 1 - self.histogram = Buffer([-1] + [0.0] * (bins - 1)) - - def _non_linear_param_search(self): - r"""Non-linear parameter search. - An approximation for L2 error minimization for selecting min/max. - By selecting new min/max, we filter out outliers in input distribution. - """ - - np_min_val = self.min_val.numpy()[0] - np_max_val = self.max_val.numpy()[0] - np_histogram = self.histogram.numpy() - assert len(np_histogram) == self.bins, "bins mistmatch" - bin_width = (np_max_val - np_min_val) / self.bins - - def _get_norm(delta_begin, delta_end, density, norm_type): - r""" - Compute the norm of the values uniformaly distributed between - delta_begin and delta_end. - norm = density * (integral_{begin, end} x^2) - = density * (end^3 - begin^3) / 3 - """ - assert norm_type == "L2", "Only L2 norms are currently supported" - norm = 0.0 - if norm_type == "L2": - norm = ( - delta_end * delta_end * delta_end - - delta_begin * delta_begin * delta_begin - ) / 3 - return density * norm - - def _compute_quantization_error(next_start_bin, next_end_bin, norm_type): - r""" - Compute the quantization error if we use start_bin to end_bin as the - min and max to do the quantization. - """ - - norm = 0.0 - dst_bin_width = ( - bin_width * (next_end_bin - next_start_bin + 1) / self.dst_nbins - ) - if dst_bin_width == 0.0: - return 0.0 - for src_bin in range(self.bins): - # distances from the beginning of first dst_bin to the beginning and - # end of src_bin - src_bin_begin = (src_bin - next_start_bin) * bin_width - src_bin_end = src_bin_begin + bin_width - - # which dst_bins the beginning and end of src_bin belong to? 
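# Note on the loop below: each source bin's probability mass is split into
# three parts -- the partial destination bin containing src_bin_begin, any
# destination bins fully covered in between, and the partial destination bin
# containing src_bin_end. For a constant density rho over [delta_begin,
# delta_end], _get_norm integrates rho * x^2, i.e. rho * (end^3 - begin^3) / 3;
# e.g. rho = 1 on [0, 1] gives 1/3, the mean square of a uniform variable.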
- dst_bin_of_begin = min( - self.dst_nbins - 1, - max(0.0, math.floor(src_bin_begin / dst_bin_width)), - ) - dst_bin_of_end = min( - self.dst_nbins - 1, - max(0.0, math.floor(src_bin_end / dst_bin_width)), - ) - dst_bin_of_begin_center = ( - dst_bin_of_begin * dst_bin_width + dst_bin_width / 2 - ) - - density = np_histogram[src_bin] / bin_width - if dst_bin_of_begin == dst_bin_of_end: - # if src_bin is entirely within 1 dst_bin - delta_begin = src_bin_begin - dst_bin_of_begin_center - delta_end = src_bin_end - dst_bin_of_begin_center - norm = norm + _get_norm(delta_begin, delta_end, density, norm_type) - else: - delta_begin = src_bin_begin - dst_bin_of_begin_center - delta_end = dst_bin_width / 2 - norm = norm + _get_norm(delta_begin, delta_end, density, norm_type) - - norm = norm + (dst_bin_of_end - dst_bin_of_begin - 1) * _get_norm( - -dst_bin_width / 2, dst_bin_width / 2, density, norm_type - ) - - dst_bin_of_end_center = ( - dst_bin_of_end * dst_bin_width + dst_bin_width / 2 - ) - - delta_begin = -dst_bin_width / 2 - delta_end = src_bin_end - dst_bin_of_end_center - norm = norm + _get_norm(delta_begin, delta_end, density, norm_type) - return norm - - # cumulative sum - total = sum(np_histogram) - cSum = np.cumsum(np_histogram, axis=0) - - stepsize = 1e-5 # granularity - alpha = 0.0 # lower bound - beta = 1.0 # upper bound - start_bin = 0 - end_bin = self.bins - 1 - norm_min = float("inf") - - while alpha < beta: - # Find the next step - next_alpha = alpha + stepsize - next_beta = beta - stepsize - - # find the left and right bins between the quantile bounds - l = start_bin - r = end_bin - while l < end_bin and cSum[l] < next_alpha * total: - l = l + 1 - while r > start_bin and cSum[r] > next_beta * total: - r = r - 1 - - # decide the next move - next_start_bin = start_bin - next_end_bin = end_bin - if (l - start_bin) > (end_bin - r): - # move the start bin - next_start_bin = l - alpha = next_alpha - else: - # move the end bin - next_end_bin = r - beta = next_beta - - if next_start_bin == start_bin and next_end_bin == end_bin: - continue - - # calculate the quantization error using next_start_bin and next_end_bin - norm = _compute_quantization_error(next_start_bin, next_end_bin, "L2") - - if norm > norm_min: - break - norm_min = norm - start_bin = next_start_bin - end_bin = next_end_bin - - new_min = self.min_val + bin_width * start_bin - new_max = self.min_val + bin_width * (end_bin + 1) - return new_min, new_max - - def get_qparams(self): - new_min, new_max = self._non_linear_param_search() - return self._calculate_qparams(new_min, new_max) - - def _combine_histograms( - self, orig_hist, new_hist, upsample_rate, downsample_rate, start_idx, Nbins - ): - # First up-sample the histogram with new data by a factor of L - # This creates an approximate probability density thats piecwise constant - upsampled_histogram = new_hist.repeat(upsample_rate) - # Now insert the upsampled histogram into the output - # histogram, which is initialized with zeros. 
- # The offset at which the histogram is introduced is determined - # by the start index as the output histogram can cover a wider range - histogram_with_output_range = np.zeros((Nbins * downsample_rate)) - histogram_with_output_range[ - start_idx : Nbins * upsample_rate + start_idx - ] = upsampled_histogram - # Compute integral histogram, double precision is needed to ensure - # that there are no overflows - integral_histogram = np.cumsum(histogram_with_output_range, 0)[ - downsample_rate - 1 :: downsample_rate - ] - # Finally perform interpolation - shifted_integral_histogram = np.zeros((Nbins)) - shifted_integral_histogram[1:Nbins] = integral_histogram[0:-1] - interpolated_histogram = ( - integral_histogram - shifted_integral_histogram - ) / upsample_rate - orig_hist = orig_hist + interpolated_histogram - return orig_hist - - def _adjust_min_max(self, combined_min, combined_max, upsample_rate): - # We ensure that: - # (combined_max - combined_min)/(downsample_rate*Nbins) = (max - min)/(upsample_rate*Nbins) - # This allows us to have a common grid of resolution s, where we can align - # the input histogram - # start_idx maps min_val to the histogram bin index. - np_min_val = self.min_val.numpy()[0] - np_max_val = self.max_val.numpy()[0] - - hist_bin_width = (np_max_val - np_min_val) / (self.bins * upsample_rate) - downsample_rate = int( - np.ceil((combined_max - combined_min) / (self.bins * hist_bin_width)) - ) - e = downsample_rate * (self.bins * hist_bin_width) - ( - combined_max - combined_min - ) - combined_max = combined_max + e / 2 - combined_min = combined_min - e / 2 - start_idx = int(np.round((np_min_val - combined_min) / hist_bin_width)) - - return combined_min, combined_max, downsample_rate, start_idx - - @sideeffect - def sideeffect_forward(self, x_orig): - x = x_orig.numpy() - min_val = self.min_val.numpy()[0] - max_val = self.max_val.numpy()[0] - histogram = self.histogram.numpy() - new_min = x.min() - new_max = x.max() - if histogram[0] == -1: - new_histogram, _ = np.histogram(x, self.bins, (new_min, new_max)) - else: - new_min = min(new_min, min_val) - new_max = max(new_max, max_val) - # combine the existing histogram and new histogram into 1 histogram - # We do this by first upsampling the histogram to a dense grid - # and then downsampling the histogram efficiently - (new_min, new_max, downsample_rate, start_idx,) = self._adjust_min_max( - new_min, new_max, self.upsample_rate - ) - - new_histogram, _ = np.histogram(x, self.bins, (new_min, new_max)) - new_histogram = new_histogram.astype(np.float64) - if new_min == min_val and new_max == max_val: - new_histogram += histogram - else: - new_histogram = self._combine_histograms( - new_histogram, - histogram, - self.upsample_rate, - downsample_rate, - start_idx, - self.bins, - ) - - self.histogram.set_value(new_histogram) - self.min_val.set_value(new_min) - self.max_val.set_value(new_max) - - def forward(self, x_orig): - self.sideeffect_forward(x_orig) - return x_orig diff --git a/python_module/megengine/quantization/qconfig.py b/python_module/megengine/quantization/qconfig.py deleted file mode 100644 index 6606c1a5..00000000 --- a/python_module/megengine/quantization/qconfig.py +++ /dev/null @@ -1,109 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-from functools import partial
-
-from ..module import Module
-from .fake_quant import TQT, FakeQuantize
-from .observer import (
-    ExponentialMovingAverageObserver,
-    HistogramObserver,
-    MinMaxObserver,
-)
-
-
-class QConfig:
-    r"""
-    A config class indicating how to quantize a :class:`~.QATModule`'s
-    ``activation`` and ``weight``. See :meth:`~.QATModule.set_qconfig` for detailed usage.
-
-    :param weight_observer: interface to instantiate an :class:`~.Observer` indicating
-        how to collect scales and zero_point of weight.
-    :param act_observer: similar to ``weight_observer`` but toward activation.
-    :param weight_fake_quant: interface to instantiate a :class:`~.FakeQuantize` indicating
-        how to do fake_quant calculation.
-    :param act_fake_quant: similar to ``weight_fake_quant`` but toward activation.
-
-    Examples:
-
-    .. code-block::
-
-        # Default EMA QConfig for QAT.
-        ema_fakequant_qconfig = QConfig(
-            weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True),
-            act_observer=partial(ExponentialMovingAverageObserver, dtype="qint8", narrow_range=False),
-            weight_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=True),
-            act_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=False),
-        )
-
-    Each parameter is a ``class`` rather than an instance. We recommend using ``functools.partial``
-    to bind initialization parameters of the ``class``, so that you don't need to provide them in
-    :meth:`~.QATModule.set_qconfig`.
-
-    Usually we set ``narrow_range`` of weight-related parameters to ``True`` and of activation-related
-    parameters to ``False``. Take the multiply-accumulate ``a * b + c * d`` as an example: if all
-    four variables are -128 of dtype ``qint8``, the result is ``2^15`` and overflows. Weights are
-    commonly involved in such computations, so their range needs to be narrowed.
-    """
-
-    def __init__(
-        self, weight_observer, act_observer, weight_fake_quant, act_fake_quant
-    ):
-        if isinstance(act_observer, Module) or isinstance(weight_observer, Module):
-            raise ValueError(
-                "QConfig must not receive observer instance, please pass observer"
-                " class generator using `partial(Observer, ...)` instead.
Use" - " partial(MyObserver, x=1) to override arguments to constructor if needed" - ) - self.weight_observer = weight_observer - self.act_observer = act_observer - self.weight_fake_quant = weight_fake_quant - self.act_fake_quant = act_fake_quant - - -tqt_quant_qconfig = QConfig( - weight_observer=partial( - ExponentialMovingAverageObserver, dtype="qint8", narrow_range=True - ), - act_observer=partial( - ExponentialMovingAverageObserver, dtype="qint8", narrow_range=False - ), - weight_fake_quant=partial(TQT, dtype="qint8", narrow_range=True), - act_fake_quant=partial(TQT, dtype="qint8", narrow_range=False), -) - -min_max_fakequant_qconfig = QConfig( - weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True), - act_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=False), - weight_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=True), - act_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=False), -) - -ema_fakequant_qconfig = QConfig( - weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True), - act_observer=partial( - ExponentialMovingAverageObserver, dtype="qint8", narrow_range=False - ), - weight_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=True), - act_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=False), -) - -ema_lowbit_fakequant_qconfig = QConfig( - weight_observer=partial(MinMaxObserver, dtype="qint4", narrow_range=False), - act_observer=partial( - ExponentialMovingAverageObserver, dtype="qint4", narrow_range=False - ), - weight_fake_quant=partial(FakeQuantize, dtype="qint4", narrow_range=False), - act_fake_quant=partial(FakeQuantize, dtype="qint4", narrow_range=False), -) - -calibration_qconfig = QConfig( - weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True), - act_observer=partial(HistogramObserver, dtype="qint8", narrow_range=False), - weight_fake_quant=None, - act_fake_quant=None, -) diff --git a/python_module/megengine/quantization/quantize.py b/python_module/megengine/quantization/quantize.py deleted file mode 100644 index 5dab2ae4..00000000 --- a/python_module/megengine/quantization/quantize.py +++ /dev/null @@ -1,191 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from copy import copy, deepcopy -from typing import Callable, Dict, Tuple - -from .. 
import module as Float -from ..module import Module -from ..module import qat as QAT -from ..module import quantized as Quantized -from ..module.qat import QATModule -from ..module.quantized import QuantizedModule -from .fake_quant import TQT -from .qconfig import QConfig, ema_fakequant_qconfig - - -def _get_quantable_module_names(): - def is_quantable(key: str): - value = getattr(Quantized, key) - return ( - isinstance(value, type) - and issubclass(value, QuantizedModule) - and value != QuantizedModule - ) - - # source should have all quantable modules' names - quantable_module_names = [key for key in dir(Quantized) if is_quantable(key)] - return quantable_module_names - - -def _get_convert_dict() -> Tuple[ - Dict[Module, QATModule], Dict[QATModule, QuantizedModule] -]: - quantable_module_names = _get_quantable_module_names() - - quantable_modules = [getattr(Float, key) for key in quantable_module_names] - qat_modules = [getattr(QAT, key) for key in quantable_module_names] - quantized_modules = [getattr(Quantized, key) for key in quantable_module_names] - - float2qat_dict = dict(zip(quantable_modules, qat_modules)) - qat2quantized_dict = dict(zip(qat_modules, quantized_modules)) - return float2qat_dict, qat2quantized_dict - - -_float2qat_dict, _qat2quantized_dict = _get_convert_dict() - - -def quantize(module: Module, inplace: bool = True, mapping: dict = None): - r""" - Recursively convert :class:`~.QATModule` to :class:`~.QuantizedModule` - through :meth:`~.Module.apply`. - - :param module: root module to do convert recursively. - :param inplace: whether to convert submodules in-place. - :param mapping: a dict indicating how to convert custom modules from QATModule to - QuantizedModule. Will be combined with internal default convert mapping dict. - """ - - if not inplace: - module = deepcopy(module) - - convert_dict = copy(_qat2quantized_dict) - if mapping is not None: - convert_dict.update(mapping) - qat_modules = tuple(convert_dict.keys()) - - def is_qat(mod: Module): - return isinstance(mod, qat_modules) - - # must use list to avoid replacement influencing successor modules - for key, submodule, parent in list( - module._flatten(with_key=True, with_parent=True, predicate=is_qat) - ): - new_mod = convert_dict[type(submodule)].from_qat_module(submodule) - if isinstance(parent, Float.Sequential): - # cannnot use setattr to be compatible with Sequential's ``__setitem__`` - parent[int(key.split(".")[-1])] = new_mod - else: - setattr(parent, key.split(".")[-1], new_mod) - - return module - - -def quantize_qat( - module: Module, - inplace: bool = True, - qconfig: QConfig = ema_fakequant_qconfig, - mapping: dict = None, -): - r""" - Recursively convert float :class:`~.Module` to :class:`~.QATModule` - through :meth:`~.Module.apply` and set qconfig relatively. - - :param module: root module to do convert recursively. - :param inplace: whether to convert submodules in-place. - :param qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig. - default is ``ema_fakequant_qconfig``. - :param mapping: a dict indicating how to convert custom modules from Module to QATModule. - Will be combined with internal default convert mapping dict. 
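    A minimal usage sketch (the model here is hypothetical; the default
    ``ema_fakequant_qconfig`` is assumed):

    .. code-block::

        import megengine.module as M
        from megengine.quantization.quantize import quantize, quantize_qat

        model = M.Sequential(M.Conv2d(3, 8, 3), M.ReLU())
        quantize_qat(model)   # Float modules -> QAT modules, in-place
        # ... QAT finetuning with observers and fake-quant enabled ...
        quantize(model)       # QAT modules -> Quantized modules for deployment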
- """ - - if not inplace: - module = deepcopy(module) - - convert_dict = copy(_float2qat_dict) - if mapping is not None: - convert_dict.update(mapping) - quantable_modules = tuple(convert_dict.keys()) - - def is_quantable(mod: Module): - return isinstance(mod, quantable_modules) - - # must use list to avoid replacement influencing successor modules - for key, submodule, parent in list( - module._flatten(with_key=True, with_parent=True, predicate=is_quantable) - ): - # only convert top quantable module. - if is_quantable(parent) or submodule.quantize_disabled: - continue - - new_mod = convert_dict[type(submodule)].from_float_module(submodule) - if isinstance(parent, Float.Sequential): - # cannnot use setattr to be compatible with Sequential's ``__setitem__`` - parent[int(key.split(".")[-1])] = new_mod - else: - setattr(parent, key.split(".")[-1], new_mod) - - propagate_qconfig(module, qconfig) - return module - - -def _propagate(module: Module, func_str: str, *args, **kargs): - def fn(mod: Module): - if isinstance(mod, QATModule): - getattr(mod, func_str)(*args, **kargs) - - module.apply(fn) - - -def propagate_qconfig(module: QATModule, qconfig: QConfig): - r""" - Recursively set ``module``'s qconfig through :meth:`~.Module.apply`. - - :param module: root module to traverse recursively. - :param qconfig: a instance of :class:`~.QConfig` to be set as submodules' qconfig. - """ - _propagate(module, "set_qconfig", qconfig) - - -def disable_fake_quant(module: Module): - r""" - Recursively disable ``module`` fake quantization in QATModule through :meth:`~.Module.apply` - - :param module: root module to do disable fake quantization recursively. - """ - - _propagate(module, "set_fake_quant", False) - - -def disable_observer(module: Module): - r""" - Recursively disable ``module`` observer in QATModule through :meth:`~.Module.apply` - - :param module: root module to do disable observer recursively. - """ - - _propagate(module, "set_observer", False) - - -def enable_fake_quant(module: Module): - r""" - Recursively enable ``module`` fake quantization in QATModule through :meth:`~.Module.apply` - - :param module: root module to do enable fake quantization recursively. - """ - - _propagate(module, "set_fake_quant", True) - - -def enable_observer(module: Module): - r""" - Recursively enable ``module`` observer in QATModule through :meth:`~.Module.apply` - - :param module: root module to do enable observer recursively. - """ - - _propagate(module, "set_observer", True) diff --git a/python_module/megengine/quantization/utils.py b/python_module/megengine/quantization/utils.py deleted file mode 100644 index 2b940c9d..00000000 --- a/python_module/megengine/quantization/utils.py +++ /dev/null @@ -1,116 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -from enum import Enum -from functools import partial, update_wrapper, wraps -from typing import Dict - -from .. import functional as F -from .._internal.dtype import _metadata_dict -from ..core import Function, Tensor - - -class Round(Function): - """ - The functional round have no grad and can not use for quantization-aware-training. - We use Function and STE(Straight-Through Estimator) to implement backward propagation. 
- """ - - def forward(self, x): - return x.round() - - def backward(self, output_grads): - return output_grads - - -def register_method_to_class(cls): - def decorator(func): - @wraps(func) - def wrapper(self, *args, **kwargs): - return func(self, *args, **kwargs) - - if isinstance(func, partial): - update_wrapper(func, func.func) - setattr(cls, func.__name__, wrapper) - return func - - return decorator - - -class QuantMode(Enum): - """Quantization mode enumerate class. - """ - - SYMMERTIC = 1 - ASYMMERTIC = 2 - TQT = 3 - - -qparam_dict = { - QuantMode.SYMMERTIC: {"mode": QuantMode.SYMMERTIC, "scale": None,}, - QuantMode.ASYMMERTIC: { - "mode": QuantMode.ASYMMERTIC, - "scale": None, - "zero_point": None, - }, - QuantMode.TQT: {"mode": QuantMode.TQT, "scale": None,}, -} - - -def get_qparam_dict(mode: QuantMode): - """Return the quantization parameters dictory according to the mode. - """ - return qparam_dict.get(mode, None) - - -def fake_quant_tensor(inp: Tensor, qmin: int, qmax: int, q_dict: Dict) -> Tensor: - """Apply fake quantization to the inp tensor. - - :param inp: the input tensor which need to be faked. - :param qmin: the minimum value which the integer limit to. - :param qmax: the maximum value which the integer limit to. - :param q_dict: the quantization parameter dict. - - """ - scale = q_dict["scale"] - zero_point = 0 - if q_dict["mode"] == QuantMode.ASYMMERTIC: - zero_point = q_dict["zero_point"] - # Quant - oup = Round()(inp / scale) + zero_point - # Clip - oup = F.minimum(F.maximum(oup, qmin), qmax) - # Dequant - oup = (oup - zero_point) * scale - return oup - - -def fake_quant_bias(bias: Tensor, inp: Tensor, w_qat: Tensor) -> Tensor: - """Apply fake quantization to bias, the special scale from input tensor - and weight tensor, the quantized type set to qint32 also. - - :param bias: the bias tensor which need to be faked. - :param inp: the input tensor which contain the quantization parameters. - :param qmax: the weight tensor which contain the quantization parameters. - - .. warning:: - Only work for symmetric quantization method now. - - """ - b_qat = bias - if hasattr(inp, "q_dict") and b_qat is not None: - if inp.q_dict["scale"] is not None and w_qat.q_dict["scale"] is not None: - # use the same mode with weight. - b_dict = get_qparam_dict(w_qat.q_dict["mode"]) - b_dict["scale"] = inp.q_dict["scale"] * w_qat.q_dict["scale"] - # TODO: add zero_point for ASYMMERTIC mode. - qmax = _metadata_dict["qint32"].qmax - qmin = _metadata_dict["qint32"].qmin - b_qat = fake_quant_tensor(b_qat, qmin, qmax, b_dict) - - return b_qat diff --git a/python_module/megengine/random/__init__.py b/python_module/megengine/random/__init__.py deleted file mode 100644 index 86c8d797..00000000 --- a/python_module/megengine/random/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-from .distribution import gaussian, uniform -from .rng import manual_seed - -# pylint: disable=undefined-variable -del distribution, rng # type: ignore[name-defined] diff --git a/python_module/megengine/random/distribution.py b/python_module/megengine/random/distribution.py deleted file mode 100644 index 55977f9c..00000000 --- a/python_module/megengine/random/distribution.py +++ /dev/null @@ -1,102 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from typing import Iterable, Optional - -import megengine._internal as mgb -from megengine._internal import CompGraph, CompNode - -from ..core.graph import _use_default_if_none -from ..core.tensor import Tensor, wrap_io_tensor -from .rng import _random_seed_generator - -__all__ = ["gaussian", "uniform"] - - -@wrap_io_tensor -def gaussian( - shape: Iterable[int], - mean: float = 0, - std: float = 1, - comp_node: Optional[CompNode] = None, - comp_graph: Optional[CompGraph] = None, -) -> Tensor: - r"""Random variable with Gaussian distribution $N(\mu, \sigma)$ - - :param shape: Output tensor shape - :param mean: The mean or expectation of the distribution - :param std: The standard deviation of the distribution (variance = $\sigma ^ 2$) - :param comp_node: The comp node output on, default to None - :param comp_graph: The graph in which output is, default to None - :return: The output tensor - - Examples: - - .. testcode:: - - import megengine as mge - import megengine.random as rand - - x = rand.gaussian((2, 2), mean=0, std=1) - print(x.numpy()) - - .. testoutput:: - :options: +SKIP - - [[-0.20235455 -0.6959438 ] - [-1.4939808 -1.5824696 ]] - - """ - comp_node, comp_graph = _use_default_if_none(comp_node, comp_graph) - seed = _random_seed_generator().__next__() - return mgb.opr.gaussian_rng( - shape, seed=seed, mean=mean, std=std, comp_node=comp_node, comp_graph=comp_graph - ) - - -@wrap_io_tensor -def uniform( - shape: Iterable[int], - low: float = 0, - high: float = 1, - comp_node: Optional[CompNode] = None, - comp_graph: Optional[CompGraph] = None, -) -> Tensor: - r"""Random variable with uniform distribution $U(0, 1)$ - - :param shape: Output tensor shape - :param low: Lower range - :param high: Upper range - :param comp_node: The comp node output on, default to None - :param comp_graph: The graph in which output is, default to None - :return: The output tensor - - Examples: - - .. testcode:: - - import megengine as mge - import megengine.random as rand - - x = rand.uniform((2, 2)) - print(x.numpy()) - - .. 
testoutput:: - :options: +SKIP - - [[0.76901674 0.70496535] - [0.09365904 0.62957656]] - - """ - assert low < high, "Uniform is not defined when low >= high" - - comp_node, comp_graph = _use_default_if_none(comp_node, comp_graph) - seed = _random_seed_generator().__next__() - return low + (high - low) * mgb.opr.uniform_rng( - shape, seed=seed, comp_node=comp_node, comp_graph=comp_graph - ) diff --git a/python_module/megengine/random/rng.py b/python_module/megengine/random/rng.py deleted file mode 100644 index 54da30bd..00000000 --- a/python_module/megengine/random/rng.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import time - -from numpy.random import MT19937 - -_rng = None - - -def _random_seed_generator(): - if _rng is None: - from ..distributed.util import get_rank - - manual_seed(seed=int(time.time()) + get_rank()) - while True: - yield _rng.random_raw() - - -def manual_seed(seed: int): - global _rng # pylint: disable=global-statement - _rng = MT19937(seed=seed) diff --git a/python_module/megengine/test/__init__.py b/python_module/megengine/test/__init__.py deleted file mode 100644 index 44ed54c2..00000000 --- a/python_module/megengine/test/__init__.py +++ /dev/null @@ -1,67 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - - -def assertTensorClose( - v0, v1, *, max_err: float = 1e-6, allow_special_values: bool = False, name=None -): - """ - :param allow_special_values: whether to allow :attr:`v0` and :attr:`v1` to contain inf and nan values. - :param max_err: relative error - """ - __tracebackhide__ = True # pylint: disable=unused-variable - - assert ( - v0.dtype == v1.dtype - ), "Two Tensor must have same dtype, but the inputs are {} and {}".format( - v0.dtype, v1.dtype - ) - v0 = np.ascontiguousarray(v0, dtype=np.float32).copy() - v1 = np.ascontiguousarray(v1, dtype=np.float32).copy() - if allow_special_values: - # check nan and rm it - v0_nan_mask = np.isnan(v0) - if np.any(v0_nan_mask): - assert np.array_equiv(v0_nan_mask, np.isnan(v1)), (v0, v1) - v0[v0_nan_mask] = 0 - v1[v0_nan_mask] = 0 - # check inf and rm it - v0_inf_mask = v0 == float("inf") - if np.any(v0_inf_mask): - assert np.array_equiv(v0_inf_mask, v1 == float("inf")), (v0, v1) - v0[v0_inf_mask] = 0 - v1[v0_inf_mask] = 0 - # check -inf and rm it - v0_inf_mask = v0 == float("-inf") - if np.any(v0_inf_mask): - assert np.array_equiv(v0_inf_mask, v1 == float("-inf")), (v0, v1) - v0[v0_inf_mask] = 0 - v1[v0_inf_mask] = 0 - else: - assert np.isfinite(v0.sum()) and np.isfinite(v1.sum()), (v0, v1) - - assert v0.shape == v1.shape, "Two tensor must have same shape({} v.s. 
{})".format( - v0.shape, v1.shape - ) - vdiv = np.max([np.abs(v0), np.abs(v1), np.ones_like(v0)], axis=0) - err = np.abs(v0 - v1) / vdiv - check = err > max_err - if check.sum(): - idx = tuple(i[0] for i in np.nonzero(check)) - if name is None: - name = "tensor" - else: - name = "tensor {}".format(name) - raise AssertionError( - "{} not equal: " - "shape={} nonequal_idx={} v0={} v1={} err={}".format( - name, v0.shape, idx, v0[idx], v1[idx], err[idx] - ) - ) diff --git a/python_module/megengine/utils/__init__.py b/python_module/megengine/utils/__init__.py deleted file mode 100644 index 6b7b2d3a..00000000 --- a/python_module/megengine/utils/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -from megengine._internal.plugin import load_tensor_binary - - -def prod(iterable): - result = 1 - for i in iterable: - result *= i - return result diff --git a/python_module/megengine/utils/hook.py b/python_module/megengine/utils/hook.py deleted file mode 100644 index 9864a94a..00000000 --- a/python_module/megengine/utils/hook.py +++ /dev/null @@ -1,23 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import weakref - - -class HookHandler: - hook_num = 0 - - def __init__(self, source_dict, hook): - self.id = HookHandler.hook_num - HookHandler.hook_num += 1 - source_dict[self.id] = hook - self.source_ref = weakref.ref(source_dict) - - def remove(self): - source_dict = self.source_ref() - if source_dict is not None and self.id in source_dict: - del source_dict[self.id] diff --git a/python_module/megengine/utils/http_download.py b/python_module/megengine/utils/http_download.py deleted file mode 100644 index add2a649..00000000 --- a/python_module/megengine/utils/http_download.py +++ /dev/null @@ -1,66 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import hashlib
-import os
-import shutil
-from tempfile import NamedTemporaryFile
-
-import requests
-from tqdm import tqdm
-
-from ..logger import get_logger
-
-logger = get_logger(__name__)
-
-CHUNK_SIZE = 1024
-HTTP_CONNECTION_TIMEOUT = 5
-
-
-class HTTPDownloadError(BaseException):
-    """The class that represents http request error"""
-
-
-def download_from_url(url: str, dst: str, http_read_timeout=120):
-    """
-    Downloads a file from the given url to ``dst``.
-
-    :param url: source URL
-    :param dst: saving path
-    :param http_read_timeout: how many seconds to wait for data before giving up
-    """
-    dst = os.path.expanduser(dst)
-    dst_dir = os.path.dirname(dst)
-
-    resp = requests.get(
-        url, timeout=(HTTP_CONNECTION_TIMEOUT, http_read_timeout), stream=True
-    )
-    if resp.status_code != 200:
-        raise HTTPDownloadError("An error occurred when downloading from {}".format(url))
-
-    md5 = hashlib.md5()
-    total_size = int(resp.headers.get("Content-Length", 0))
-    bar = tqdm(
-        total=total_size, unit="iB", unit_scale=True, ncols=80
-    )  # pylint: disable=blacklisted-name
-    try:
-        with NamedTemporaryFile("w+b", delete=False, suffix=".tmp", dir=dst_dir) as f:
-            logger.info("Download file to temp file %s", f.name)
-            for chunk in resp.iter_content(CHUNK_SIZE):
-                if not chunk:
-                    break
-                bar.update(len(chunk))
-                f.write(chunk)
-                md5.update(chunk)
-            bar.close()
-        shutil.move(f.name, dst)
-    finally:
-        # ensure tmp file is removed
-        if os.path.exists(f.name):
-            os.remove(f.name)
-    return md5.hexdigest()
diff --git a/python_module/megengine/utils/max_recursion_limit.py b/python_module/megengine/utils/max_recursion_limit.py
deleted file mode 100644
index d7bce6e8..00000000
--- a/python_module/megengine/utils/max_recursion_limit.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import platform
-import sys
-import threading
-
-# Windows does not provide the resource package
-if platform.system() != "Windows":
-    import resource
-
-
-class AlternativeRecursionLimit:
-    r"""A reentrant context manager for setting global recursion limits.
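    A minimal usage sketch (``max_recursion_limit`` is the module-level helper
    defined below):

    .. code-block::

        import sys
        from megengine.utils.max_recursion_limit import max_recursion_limit

        print(sys.getrecursionlimit())      # default, usually 1000
        with max_recursion_limit():
            # limit is now 2 ** 31 - 1; nested entry is safe (reentrant)
            with max_recursion_limit():
                print(sys.getrecursionlimit())
        print(sys.getrecursionlimit())      # original limit restored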
- """ - - def __init__(self, new_py_limit): - self.new_py_limit = new_py_limit - self.count = 0 - self.lock = threading.Lock() - - self.orig_py_limit = 0 - self.orig_rlim_stack_soft = 0 - self.orig_rlim_stack_hard = 0 - - def __enter__(self): - with self.lock: - if self.count == 0: - self.orig_py_limit = sys.getrecursionlimit() - if platform.system() != "Windows": - ( - self.orig_rlim_stack_soft, - self.orig_rlim_stack_hard, - ) = resource.getrlimit(resource.RLIMIT_STACK) - # FIXME: https://bugs.python.org/issue34602, python3 release version - # on Macos always have this issue, not all user install python3 from src - try: - resource.setrlimit( - resource.RLIMIT_STACK, - (self.orig_rlim_stack_hard, self.orig_rlim_stack_hard), - ) - except ValueError as exc: - if platform.system() != "Darwin": - raise exc - - # increase recursion limit - sys.setrecursionlimit(self.new_py_limit) - self.count += 1 - - def __exit__(self, type, value, traceback): - with self.lock: - self.count -= 1 - if self.count == 0: - sys.setrecursionlimit(self.orig_py_limit) - - if platform.system() != "Windows": - try: - resource.setrlimit( - resource.RLIMIT_STACK, - (self.orig_rlim_stack_soft, self.orig_rlim_stack_hard), - ) - except ValueError as exc: - if platform.system() != "Darwin": - raise exc - - -_max_recursion_limit_context_manager = AlternativeRecursionLimit(2 ** 31 - 1) - - -def max_recursion_limit(): - r"""Sets recursion limit to the max possible value - """ - return _max_recursion_limit_context_manager diff --git a/python_module/megengine/utils/net_stats.py b/python_module/megengine/utils/net_stats.py deleted file mode 100644 index fa35d114..00000000 --- a/python_module/megengine/utils/net_stats.py +++ /dev/null @@ -1,279 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-from functools import partial - -import numpy as np -import tabulate - -import megengine as mge -import megengine._internal as mgb -import megengine.module as m -import megengine.module.qat as qatm -import megengine.module.quantized as qm - -try: - mge.logger.MegEngineLogFormatter.max_lines = float("inf") -except AttributeError as e: - raise ValueError("set logger max lines failed") - -logger = mge.get_logger(__name__) - - -CALC_FLOPS = {} - - -def _register_modules(*modules): - def callback(impl): - for module in modules: - CALC_FLOPS[module] = impl - return impl - - return callback - - -@_register_modules( - m.Conv2d, - m.ConvTranspose2d, - m.LocalConv2d, - qm.Conv2d, - qm.ConvRelu2d, - qm.ConvBn2d, - qm.ConvBnRelu2d, - qatm.Conv2d, - qatm.ConvRelu2d, - qatm.ConvBn2d, - qatm.ConvBnRelu2d, -) -def count_convNd(module, input, output): - bias = 1 if module.bias is not None else 0 - group = module.groups - ic = input[0].shape[1] - oc = output[0].shape[1] - goc = oc // group - gic = ic // group - N = output[0].shape[0] - HW = np.prod(output[0].shape[2:]) - # N x Cout x H x W x (Cin x Kw x Kh + bias) - return N * HW * goc * (gic * np.prod(module.kernel_size) + bias) - - -@_register_modules(m.ConvTranspose2d) -def count_deconvNd(module, input, output): - return np.prod(input[0].shape) * output[0].shape[1] * np.prod(module.kernel_size) - - -@_register_modules(m.Linear, qatm.Linear, qm.Linear) -def count_linear(module, input, output): - return np.prod(output[0].shape) * module.in_features - - -# does not need import qat and quantized module since they inherit from float module. -hook_modules = ( - m.Conv2d, - m.ConvTranspose2d, - m.LocalConv2d, - m.BatchNorm2d, - m.Linear, -) - - -def net_stats(model, input_size, bar_length_max=20, log_params=True, log_flops=True): - def dict2table(list_of_dict, header): - table_data = [header] - for d in list_of_dict: - row = [] - for h in header: - v = "" - if h in d: - v = d[h] - row.append(v) - table_data.append(row) - return table_data - - def sizeof_fmt(num, suffix="B"): - for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: - if abs(num) < 1024.0: - return "{:3.3f} {}{}".format(num, unit, suffix) - num /= 1024.0 - sign_str = "-" if num < 0 else "" - return "{}{:.1f} {}{}".format(sign_str, num, "Yi", suffix) - - def get_byteswidth(tensor): - dtype = tensor.dtype - if mgb.dtype.is_quantize(dtype): - return 1 - elif mgb.dtype.is_bfloat16(dtype): - return 2 - else: - return 4 - - def print_flops_stats(flops): - flops_list = [i["flops_num"] for i in flops] - max_flops_num = max(flops_list + [0]) - # calc total flops and set flops_cum - total_flops_num = 0 - for d in flops: - total_flops_num += int(d["flops_num"]) - d["flops_cum"] = sizeof_fmt(total_flops_num, suffix="OPs") - - for i in flops: - f = i["flops_num"] - i["flops"] = sizeof_fmt(f, suffix="OPs") - r = i["ratio"] = f / total_flops_num - i["percentage"] = "{:.2f}%".format(r * 100) - bar_length = int(f / max_flops_num * bar_length_max) - i["bar"] = "#" * bar_length - - header = [ - "name", - "class_name", - "input_shapes", - "output_shapes", - "flops", - "flops_cum", - "percentage", - "bar", - ] - - total_flops_str = sizeof_fmt(total_flops_num, suffix="OPs") - total_var_size = sum(sum(s[1] for s in i["output_shapes"]) for i in flops) - flops.append( - dict(name="total", flops=total_flops_str, output_shapes=total_var_size) - ) - - logger.info( - "flops stats: \n" + tabulate.tabulate(dict2table(flops, header=header)) - ) - - return total_flops_num - - def print_params_stats(params): - 
total_param_dims, total_param_size = 0, 0 - for d in params: - total_param_dims += int(d["param_dim"]) - total_param_size += int(d["size"]) - d["size"] = sizeof_fmt(d["size"]) - d["size_cum"] = sizeof_fmt(total_param_size) - - for d in params: - ratio = d["param_dim"] / total_param_dims - d["ratio"] = ratio - d["percentage"] = "{:.2f}%".format(ratio * 100) - - # construct bar - max_ratio = max([d["ratio"] for d in params]) - for d in params: - bar_length = int(d["ratio"] / max_ratio * bar_length_max) - d["size_bar"] = "#" * bar_length - - param_size = sizeof_fmt(total_param_size) - params.append(dict(name="total", param_dim=total_param_dims, size=param_size,)) - - header = [ - "name", - "shape", - "mean", - "std", - "param_dim", - "bits", - "size", - "size_cum", - "percentage", - "size_bar", - ] - - logger.info( - "param stats: \n" + tabulate.tabulate(dict2table(params, header=header)) - ) - - return total_param_size - - def net_stats_hook(module, input, output, name=""): - class_name = str(module.__class__).split(".")[-1].split("'")[0] - - flops_fun = CALC_FLOPS.get(type(module)) - if callable(flops_fun): - flops_num = flops_fun(module, input, output) - - if not isinstance(output, (list, tuple)): - output = [output] - - flops.append( - dict( - name=name, - class_name=class_name, - input_shapes=[i.shape for i in input], - output_shapes=[o.shape for o in output], - flops_num=flops_num, - flops_cum=0, - ) - ) - - if hasattr(module, "weight") and module.weight is not None: - w = module.weight - value = w.numpy() - param_dim = np.prod(w.shape) - param_bytes = get_byteswidth(w) - params.append( - dict( - name=name + "-w", - shape=w.shape, - param_dim=param_dim, - bits=param_bytes * 8, - size=param_dim * param_bytes, - size_cum=0, - mean="{:.2g}".format(value.mean()), - std="{:.2g}".format(value.std()), - ) - ) - - if hasattr(module, "bias") and module.bias is not None: - b = module.bias - value = b.numpy() - param_dim = np.prod(b.shape) - param_bytes = get_byteswidth(b) - params.append( - dict( - name=name + "-b", - shape=b.shape, - param_dim=param_dim, - bits=param_bytes * 8, - size=param_dim * param_bytes, - size_cum=0, - mean="{:.2g}".format(value.mean()), - std="{:.2g}".format(value.std()), - ) - ) - - # multiple inputs to the network - if not isinstance(input_size[0], tuple): - input_size = [input_size] - - params = [] - flops = [] - hooks = [] - - for (name, module) in model.named_modules(): - if isinstance(module, hook_modules): - hooks.append( - module.register_forward_hook(partial(net_stats_hook, name=name)) - ) - - inputs = [mge.zeros(in_size, dtype=np.float32) for in_size in input_size] - model.eval() - model(*inputs) - for h in hooks: - h.remove() - - total_flops, total_params = 0, 0 - if log_params: - total_params = print_params_stats(params) - if log_flops: - total_flops = print_flops_stats(flops) - - return total_params, total_flops diff --git a/python_module/megengine/utils/profile_analyze.py b/python_module/megengine/utils/profile_analyze.py deleted file mode 100755 index 8041c0d8..00000000 --- a/python_module/megengine/utils/profile_analyze.py +++ /dev/null @@ -1,424 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
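As a worked instance of the ``count_convNd`` formula from net_stats.py above
(the layer shapes here are hypothetical):

    import numpy as np

    # Conv2d(3, 8, kernel_size=3, padding=1, bias=True) on a 1x3x32x32 input
    # produces a 1x8x32x32 output.
    N, ic, oc, group, bias = 1, 3, 8, 1, 1
    HW, kernel_size = 32 * 32, (3, 3)

    gic, goc = ic // group, oc // group
    macs = N * HW * goc * (gic * np.prod(kernel_size) + bias)
    print(macs)  # 229376 multiply-accumulates for this layer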
-import argparse
-import collections
-import json
-import re
-import textwrap
-
-import numpy as np
-from tabulate import tabulate
-
-from megengine.utils.profile_analyzer import (
-    NonExistNum,
-    ProfileAnalyzer,
-    TimeFuncHelper,
-)
-
-
-def _tabulate_ml(tab, **kwargs):
-    """Tabulate profile output with multi-line support."""
-    new_tab = []
-    new_tab_is_row = []
-    for row in tab:
-        col_lines = [str(i).split("\n") for i in row]
-        max_nr_line = max(map(len, col_lines))
-        new_tab_is_row.append(True)
-        if max_nr_line > 1:
-            new_tab_is_row.extend([False] * (max_nr_line - 1))
-            for i in col_lines:
-                if len(i) < max_nr_line:
-                    i.extend([""] * (max_nr_line - len(i)))
-            new_tab.extend(zip(*col_lines))
-        else:
-            new_tab.append(row)
-
-    assert len(new_tab_is_row) == len(new_tab)
-    ret = [i + "\n" for i in tabulate(new_tab, **kwargs).split("\n")]
-    for idx, val in enumerate(new_tab_is_row):
-        if not val:
-            ret[idx * 2 + 2] = ""
-    return "".join(ret)[:-1]
-
-
-def _tabulate_confluence(tab, **kwargs):
-    """Tabulate profile output."""
-    kwargs.pop("tablefmt", None)
-    s = tabulate(tab, tablefmt="orgtbl", **kwargs)
-    lines = s.split("\n")
-    lines[1] = lines[1].replace("+", "|")
-    return "\n".join(lines)
-
-
-def main(passed_args=None):  # pylint: disable=too-many-statements
-    """Analyzes profile info from :mod:`~.utils.profile_analyzer`.
-
-    Run this file with ``--help`` for more usage information.
-    """
-    parser = argparse.ArgumentParser(
-        description="analyze analyzer result",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument("dump")
-    parser.add_argument(
-        "-t",
-        "--top",
-        type=int,
-        default=3,
-        help="number of most time-consuming operators to print",
-    )
-    parser.add_argument(
-        "--type", action="append", help="filter oprs in the top list by type"
-    )
-    parser.add_argument(
-        "--aggregate-by",
-        default=None,
-        choices=["type"],
-        help="aggregate profiling result by",
-    )
-    parser.add_argument(
-        "--opr-name", help="filter oprs in the top list by regex of name"
-    )
-    parser.add_argument(
-        "--input-dtype", type=str, help="filter oprs in the top list by input dtype"
-    )
-    parser.add_argument(
-        "--top-end-key",
-        default="end",
-        choices=["end", "kern"],
-        help="how time in top is calculated; end corresponds "
-        "to total device time, and kern corresponds to only "
-        "wait time",
-    )
-    parser.add_argument(
-        "--aggregate",
-        default=None,
-        help="aggregate operations",
-        choices=["max", "min", "sum", "mean"],
-    )
-    parser.add_argument(
-        "--order-by",
-        default="time",
-        help="sort result according to given column; the param can be "
-        "<column> or +<column>, meaning sorting in descending or "
-        "ascending order respectively",
-    )
-    parser.add_argument(
-        "--copy-time", action="store_true", help="show copy time related result"
-    )
-    parser.add_argument(
-        "--min-time",
-        type=float,
-        default=float("-inf"),
-        help="minimal time of a result to be printed",
-    )
-    parser.add_argument(
-        "--max-time",
-        type=float,
-        default=float("inf"),
-        help="maximal time of a result to be printed",
-    )
-    parser.add_argument(
-        "--show-host", action="store_true", help="show host profiling info"
-    )
-    parser.add_argument(
-        "--dump-only-opr",
-        action="store_true",
-        help="only dump operator info as plaintext; useful "
-        "for diff between two filtered profile results",
-    )
-    parser.add_argument(
-        "--confluence",
-        "--wiki",
-        action="store_true",
-        help="output confluence-markdown-compatible table",
-    )
-    parser.add_argument(
-        "--print-only",
-        choices={"summary", "device", "host"},
-        help="print only
chosen info", - ) - - args = parser.parse_args(passed_args) - - opr_filters = [] - if args.type: - opr_filters.append(lambda o, a, b: o["type"] in args.type) - if args.opr_name: - opr_filters.append( - lambda o, a, b, r=re.compile(args.opr_name): r.match(o["name"]) - ) - if args.input_dtype: - opr_filters.append( - lambda o, a, b: any( - [i["mem_plan"]["layout"]["dtype"] == args.input_dtype for i in a] - ) - ) - if not opr_filters: - - def opr_filter(o, a, b): # pylint: disable=unused-argument - return True - - else: - - def opr_filter(o, a, b): - return all(i(o, a, b) for i in opr_filters) - - with open(args.dump) as fin: - dump = json.load(fin) - - analyzer = ProfileAnalyzer(dump, opr_filter) - analyzer_tot = ProfileAnalyzer(dump, lambda _, __, ___: True) - - def summary(): - device_end_func = TimeFuncHelper.eval_time_func("device", "end", np.max) - device_kern_func = TimeFuncHelper.eval_time_func("device", "kern", np.max) - host_end_func = TimeFuncHelper.eval_time_func("host", "end", np.max) - - def get_tot_time(func): - rec = analyzer_tot.select(func, aggregate=np.sum) - if not rec: - return "N/A" - rec = rec[0] - return rec.time - - tab = [] - tot_dev_time = get_tot_time(device_end_func) - tot_host_time = get_tot_time(host_end_func) - tab.append(("total device time", tot_dev_time)) - tab.append(("total host time", tot_host_time)) - if args.copy_time: - - def fmt(a, b): - a = a[0] - b = b[0] - return "tot={:.4f} avg={:.4f}".format(a.time, b.time) - - tab.append( - ( - "copy time", - fmt( - analyzer.select( - device_end_func, - lambda opr: opr.opr_info["type"] == "Copy", - aggregate=np.sum, - ), - analyzer.select( - device_end_func, - lambda opr: opr.opr_info["type"] == "Copy", - aggregate=np.mean, - ), - ), - ) - ) - tab.append( - ( - "copy wait time", - fmt( - analyzer.select( - device_kern_func, - lambda opr: opr.opr_info["type"] == "Copy", - aggregate=np.sum, - ), - analyzer.select( - device_kern_func, - lambda opr: opr.opr_info["type"] == "Copy", - aggregate=np.mean, - ), - ), - ) - ) - - if args.confluence: - tab_str = _tabulate_confluence(tab, headers=["name", "value"]) - else: - tab_str = tabulate(tab) - - return tab_str, tot_dev_time, tot_host_time - - def prof_details(prof_type, tot_time): - tab = [] - - def func( - opr, - *, - f0=TimeFuncHelper.eval_time_func(prof_type, args.top_end_key, np.max) - ): - t = f0(opr) - if t is not None and (t < args.min_time or t > args.max_time): - return None - return t - - records = analyzer.select( - func, - aggregate=args.aggregate, - aggregate_by=args.aggregate_by, - top_k=args.top, - sort_by=args.order_by, - ) - - if args.dump_only_opr: - ret = [] - for i in records: - ret.append(" ".join(i.info.values())) - return "\n".join(ret) - - def format_shapes(shapes, layouts=None, sep="\n"): - if isinstance(shapes, NonExistNum) or shapes is None: - return repr(shapes) - if layouts is None: - layouts = [None] * len(shapes) - - comp = [] - for i, j in zip(shapes, layouts): - i = "{" + ",".join(map(str, i)) + "}" - if j: - i += "\n -[" + ",".join(map(str, j)) + "]" - comp.append(i) - return sep.join(comp) - - def fix_num_and_find_unit(x, base): - if isinstance(x, NonExistNum) or ( - isinstance(x, float) and not np.isfinite(x) - ): - return x, "" - unit = iter(["", "K", "M", "G", "T", "P"]) - while x >= base: - x /= base - next(unit) - return x, next(unit) - - def get_number_with_unit(num, unit, base, sep="\n"): - num, unit_prefix = fix_num_and_find_unit(num, base) - if isinstance(unit, list): - unit = unit[int(unit_prefix != "")] - return ("{:.2f}" 
+ sep + "{}{}").format(num, unit_prefix, unit) - - if args.confluence: - rows = [] - cum_time = 0 - - max_time = max([r.time for r in records]) - max_bandwidth = max([r.bandwidth for r in records]) - max_flops = max( - [r.flops for r in records if not isinstance(r.flops, NonExistNum)] - ) - - bar_length = 15 - for idx, record in enumerate(records): - cum_time += record.time - - opr_info = [("opr " + k, v) for k, v in record.info.items()] - - row = collections.OrderedDict( - [ - ("#", idx), - ("time", "{:.3}".format(record.time)), - ("ratio", "{:.1f}%".format(record.time / tot_time * 100)), - ("time bar", "#" * int(record.time / max_time * bar_length)), - ("cum-time", cum_time), - ("cum-time ratio", cum_time / tot_time), - ] - + opr_info - + [ - ( - "computation (MFLO)", - "{:.1f}".format(record.computation / 1000 ** 2), - ), - ("MFLOPS", "{:.1f}".format(record.flops / 1000 ** 2)), - ( - "MFLOPS-bar", - "" - if isinstance(record.flops, NonExistNum) - else ("#" * int(record.flops / max_flops * bar_length)), - ), - ("memory (MB)", "{:.1f}".format(record.memory / 1024 ** 2)), - ( - "bandwidth (MiB/s)", - "{:.1f}".format(record.bandwidth / 1024 ** 2), - ), - ( - "bandwidth bar", - "#" * int(record.bandwidth / max_bandwidth * bar_length), - ), - ( - "in_shapes", - format_shapes( - record.in_shapes, record.in_layouts, sep=", " - ), - ), - ("out_shapes", format_shapes(record.out_shapes, sep=", ")), - ] - ) - rows.append(row) - headers = list(rows[0].keys()) - tab = [[row[i] for i in headers] for row in rows] - - return _tabulate_confluence(tab, headers=headers) - - else: - cum_time = 0 - for idx, record in enumerate(records): - cum_time += record.time - tab.append( - ( - "#{}\n{:.3}\n{:.1f}%".format( - idx, record.time, record.time / tot_time * 100 - ), - "{:.3}\n{:.1f}%".format(cum_time, cum_time / tot_time * 100), - "\n".join( - "\n- ".join(textwrap.wrap(str(i), width=30)) - for i in record.info.values() - ), - get_number_with_unit(record.computation, "FLO", 1000), - get_number_with_unit(record.flops, "FLOPS", 1000), - get_number_with_unit(record.memory, ["byte", "iB"], 1024), - get_number_with_unit( - record.bandwidth, ["byte/s", "iB/s"], 1024 - ), - format_shapes(record.in_shapes, record.in_layouts), - format_shapes(record.out_shapes), - ) - ) - return _tabulate_ml( - tab, - headers=[ - "{} self time".format(prof_type), - "cumulative", - "operator info", - "computation", - "FLOPS", - "memory", - "bandwidth", - "in_shapes", - "out_shapes", - ], - tablefmt="fancy_grid", - ) - - summary_tab, tot_dev_time, tot_host_time = summary() - if args.print_only: - print( - { - "summary": lambda: summary_tab, - "device": lambda: prof_details("device", tot_dev_time), - "host": lambda: prof_details("host", tot_host_time), - }[args.print_only]() - ) - else: - print(summary_tab) - print() - print(prof_details("device", tot_dev_time)) - if args.show_host: - print() - print(prof_details("host", tot_host_time)) - - -if __name__ == "__main__": - main() diff --git a/python_module/megengine/utils/profile_analyzer.py b/python_module/megengine/utils/profile_analyzer.py deleted file mode 100644 index 75cc0c0c..00000000 --- a/python_module/megengine/utils/profile_analyzer.py +++ /dev/null @@ -1,401 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
diff --git a/python_module/megengine/utils/profile_analyzer.py b/python_module/megengine/utils/profile_analyzer.py
deleted file mode 100644
index 75cc0c0c..00000000
--- a/python_module/megengine/utils/profile_analyzer.py
+++ /dev/null
@@ -1,401 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import collections
-import copy
-import functools
-from typing import Callable, List, Optional, Union
-
-import numpy as np
-
-
-class NonExistNum:
-    """An object that behaves like a number but means a field does not exist; it is
-    always greater than any real number.
-    """
-
-    def __truediv__(self, _):
-        return self
-
-    def __add__(self, rhs):
-        return rhs
-
-    def __radd__(self, lhs):
-        return lhs
-
-    def __neg__(self):
-        return self
-
-    def __gt__(self, rhs):
-        if type(rhs) is NonExistNum:
-            return id(self) > id(rhs)
-        return True
-
-    def __ge__(self, rhs):
-        return self > rhs or self == rhs
-
-    def __lt__(self, rhs):
-        if type(rhs) is NonExistNum:
-            return id(self) < id(rhs)
-        return False
-
-    def __le__(self, rhs):
-        return self < rhs or self == rhs
-
-    def __eq__(self, rhs):
-        return self is rhs
-
-    def __format__(self, spec):
-        return "N/A"
-
-    def __repr__(self):
-        return "N/A"
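
The sentinel semantics of ``NonExistNum`` are easiest to see in a short sketch (a toy illustration, not part of the original module):

    from megengine.utils.profile_analyzer import NonExistNum

    na = NonExistNum()
    print("{:.2f}".format(na))  # "N/A": __format__ ignores the format spec
    print(na > 1e30)            # True: greater than any real number
    print(na / 5)               # N/A: division keeps the sentinel
    print(3.0 + na)             # 3.0: __radd__ yields the other operand
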
- """ - - assert isinstance(footprint, dict) - self.time = time - self.info = collections.OrderedDict(copy.deepcopy(info)) - self.computation = footprint["computation"] or NonExistNum() - self.memory = footprint["memory"] - self.in_shapes = footprint["in_shapes"] - self.in_layouts = footprint.get("in_layouts") - self.out_shapes = footprint["out_shapes"] - self.flops = self.computation / self.time - self.bandwidth = self.memory / self.time - self.opr_id = info.get("id") - if isinstance(self.opr_id, str) and self.opr_id != "N/A": - self.opr_id = int(self.opr_id) - - def get_column_by_name(self, name: str = None): - """extracts column value by its column name - - :param name: column name, None for time. - """ - - if name is None: - name = "time" - return getattr(self, name) - - -class ProfileAnalyzer: - def __init__(self, obj: dict, opr_filter: Callable = lambda opr, inp, out: True): - """Initializes ProfileAnalyzer - - :param obj: dict dumped from json str. - :param opr_filter: function that filter oprs. - """ - self._opr_set = dict() # type: dict - assert isinstance(obj, dict) - varz = obj["graph_exec"]["var"] - for opr_id, entry in obj["graph_exec"]["operator"].items(): - inp = [varz[i] for i in entry["input"]] - out = [varz[i] for i in entry["output"]] - if opr_filter(entry, inp, out): - self._opr_set[opr_id] = OprProfRst(entry) - - for opr_id, entry in obj["profiler"]["device"].items(): - if opr_id not in self._opr_set: - continue - opr = self._opr_set[opr_id] - for _, time in entry.items(): - opr.update_device_prof_info(time) - - for opr_id, entry in obj["profiler"]["host"].items(): - if opr_id not in self._opr_set: - continue - opr = self._opr_set[opr_id] - for _, time in entry.items(): - opr.update_host_prof_info(time) - - for opr_id, entry in obj["profiler"].get("opr_footprint", {}).items(): - if opr_id not in self._opr_set: - continue - opr = self._opr_set[opr_id] - opr.update_footprint(entry) - - def _aggregate( - self, records: List[Record], aop: Union[str, Callable], atype: Optional[str] - ) -> List[Record]: - """Aggregate operation - - :param records: selected records - :param aop: aggregate operation, if aop is str, we would replace it - with associated numpy function wth aop name" - :param atype: the type aggregated by, None for aggregating all into single - record. - """ - if aop is None: - assert atype is None, "must specify aggregate op" - return records - if isinstance(aop, str): - aop = getattr(np, aop) - type2stat = collections.defaultdict(lambda: [[], [], []]) # type: dict - for item in records: - if atype == "type": - d = type2stat[item.info["type"]] - else: - d = type2stat["all"] - d[0].append(item.time) - d[1].append(item.computation) - d[2].append(item.memory) - - rst = [] - for opr_type in type2stat.keys(): - time, computation, memory = type2stat[opr_type] - nr_oprs = len(time) - time_rst = aop(time) - comp_rst = aop(computation) - mem_rst = aop(memory) - - item = Record( - time_rst, - {"type": opr_type, "count": nr_oprs, "id": "N/A"}, - { - "computation": comp_rst, - "memory": mem_rst, - "in_shapes": None, - "out_shapes": None, - }, - ) - rst.append(item) - return rst - - def _sort(self, records: List[Record], sort_by: str) -> List[Record]: - """sort operation - - :param records: the records after aggregate operation. 
-
-
-class ProfileAnalyzer:
-    def __init__(self, obj: dict, opr_filter: Callable = lambda opr, inp, out: True):
-        """Initializes ProfileAnalyzer
-
-        :param obj: dict dumped from json str.
-        :param opr_filter: function that filters oprs.
-        """
-        self._opr_set = dict()  # type: dict
-        assert isinstance(obj, dict)
-        varz = obj["graph_exec"]["var"]
-        for opr_id, entry in obj["graph_exec"]["operator"].items():
-            inp = [varz[i] for i in entry["input"]]
-            out = [varz[i] for i in entry["output"]]
-            if opr_filter(entry, inp, out):
-                self._opr_set[opr_id] = OprProfRst(entry)
-
-        for opr_id, entry in obj["profiler"]["device"].items():
-            if opr_id not in self._opr_set:
-                continue
-            opr = self._opr_set[opr_id]
-            for _, time in entry.items():
-                opr.update_device_prof_info(time)
-
-        for opr_id, entry in obj["profiler"]["host"].items():
-            if opr_id not in self._opr_set:
-                continue
-            opr = self._opr_set[opr_id]
-            for _, time in entry.items():
-                opr.update_host_prof_info(time)
-
-        for opr_id, entry in obj["profiler"].get("opr_footprint", {}).items():
-            if opr_id not in self._opr_set:
-                continue
-            opr = self._opr_set[opr_id]
-            opr.update_footprint(entry)
-
-    def _aggregate(
-        self, records: List[Record], aop: Union[str, Callable], atype: Optional[str]
-    ) -> List[Record]:
-        """Aggregate operation
-
-        :param records: selected records
-        :param aop: aggregate operation; if aop is a str, it is replaced with
-            the associated numpy function of the same name
-        :param atype: the type aggregated by, None for aggregating all into a
-            single record.
-        """
-        if aop is None:
-            assert atype is None, "must specify aggregate op"
-            return records
-        if isinstance(aop, str):
-            aop = getattr(np, aop)
-        type2stat = collections.defaultdict(lambda: [[], [], []])  # type: dict
-        for item in records:
-            if atype == "type":
-                d = type2stat[item.info["type"]]
-            else:
-                d = type2stat["all"]
-            d[0].append(item.time)
-            d[1].append(item.computation)
-            d[2].append(item.memory)
-
-        rst = []
-        for opr_type in type2stat.keys():
-            time, computation, memory = type2stat[opr_type]
-            nr_oprs = len(time)
-            time_rst = aop(time)
-            comp_rst = aop(computation)
-            mem_rst = aop(memory)
-
-            item = Record(
-                time_rst,
-                {"type": opr_type, "count": nr_oprs, "id": "N/A"},
-                {
-                    "computation": comp_rst,
-                    "memory": mem_rst,
-                    "in_shapes": None,
-                    "out_shapes": None,
-                },
-            )
-            rst.append(item)
-        return rst
-
-    def _sort(self, records: List[Record], sort_by: str) -> List[Record]:
-        """Sort operation
-
-        :param records: the records after aggregate operation.
-        :param sort_by: keyword for sorting the list
-        """
-        if sort_by is None:
-            return records
-        if sort_by.startswith("+"):
-            sort_by = sort_by[1:]
-            key = lambda record: record.get_column_by_name(sort_by)
-        else:
-            key = lambda record: -record.get_column_by_name(sort_by)
-        records.sort(key=key)
-        return records
-
-    def select(
-        self,
-        time_func: Callable,
-        opr_filter: Callable = lambda opr: True,
-        aggregate: Callable = None,
-        aggregate_by: str = None,
-        sort_by: str = None,
-        top_k: int = 0,
-    ) -> List[Record]:
-        """Select operation
-
-        :param time_func: time_func provided by user, would apply to every
-            OprProfRst
-        :param opr_filter: filter that keeps only satisfied operators.
-        :param aggregate: function that applies to the list of records which
-            are aggregated by atype
-        :param aggregate_by: the type aggregated by
-        :param sort_by: keyword for sorting all records.
-        :param top_k: specify the maximum number of records.
-        :return: the records that go through select, aggregate, sort.
-        """
-
-        records = []
-        for opr in self._opr_set.values():
-            if opr_filter(opr):
-                time = time_func(opr)
-                if time is None:
-                    continue
-                item = Record(time, opr.opr_info, opr.footprint)
-                records.append(item)
-
-        records = self._aggregate(records, aggregate, aggregate_by)
-        if not records:
-            return records
-        return self._sort(records, sort_by)[0 : len(records) if top_k == 0 else top_k]
-
-
-class TimeFuncHelper:
-    """Time Function Helper for users."""
-
-    @staticmethod
-    def _eval_time(prof_type, end_key, func, opr_prof):
-        """Eval time
-
-        :type prof_type: str
-        :param prof_type: 'host' or 'device'
-        :type end_key: str
-        :param end_key: 'kern' or 'end'
-        :type func: function
-        :param func: apply to list of all ``thread`` of ``gpu`` time.
-        :type opr_prof: `class OprProfRst`
-        :param opr_prof: operator profiling result
-        :rtype: float
-        :return: time
-        """
-
-        if prof_type not in opr_prof.time_dict:
-            return None
-        time = [time[end_key] - time["start"] for time in opr_prof.time_dict[prof_type]]
-        return func(time)
-
-    @staticmethod
-    def eval_time_func(prof_type: str, end_key: str, func: Callable) -> float:
-        """Eval operator profile time.
-
-        :param prof_type: 'host' or 'device'
-        :param end_key: 'kern' or 'end'
-        :param func: apply to list of all ``thread`` of ``gpu`` time.
-        :return: Eval time results
-        """
-        return functools.partial(TimeFuncHelper._eval_time, prof_type, end_key, func)
-
-    @staticmethod
-    def _min_start(
-        prof_type, end_key, func, opr_prof
-    ):  # pylint: disable=unused-argument
-        """Eval minimum start time
-
-        :type prof_type: str
-        :param prof_type: 'host' or 'device'
-        :type end_key: str
-        :param end_key: 'kern' or 'end'
-        :type func: function
-        :param func: apply to list of all ``thread`` of ``gpu`` time.
-        :type opr_prof: `class OprProfRst`
-        :param opr_prof: operator profiling result
-        :rtype: float
-        :return: time
-        """
-        if prof_type not in opr_prof.time_dict:
-            return None
-        time = [time["start"] for time in opr_prof.time_dict[prof_type]]
-        return np.min(time)
-
-    @staticmethod
-    def min_start_func(
-        prof_type: str, end_key: str, func: Callable
-    ) -> float:  # pylint: disable=unused-argument
-        """Eval operator profile min start time
-
-        :param prof_type: 'host' or 'device'
-        :param end_key: 'kern' or 'end'
-        :param func: apply to list of all ``thread`` of ``gpu`` time.
-        :return: Eval time results
-        """
-        return functools.partial(TimeFuncHelper._min_start, prof_type, end_key, func)
-
-    @staticmethod
-    def _max_end(prof_type, end_key, func, opr_prof):  # pylint: disable=unused-argument
-        """Eval maximum end time
-
-        :type prof_type: str
-        :param prof_type: 'host' or 'device'
-        :type end_key: str
-        :param end_key: 'kern' or 'end'
-        :type func: function
-        :param func: apply to list of all ``thread`` of ``gpu`` time.
-        :type opr_prof: `class OprProfRst`
-        :param opr_prof: operator profiling result
-        :rtype: float
-        :return: time
-        """
-        if prof_type not in opr_prof.time_dict:
-            return None
-        time = [time["end"] for time in opr_prof.time_dict[prof_type]]
-        return np.max(time)
-
-    @staticmethod
-    def max_end_func(prof_type: str, end_key: str, func: Callable) -> float:
-        """Eval operator profile max end time
-
-        :param prof_type: 'host' or 'device'
-        :param end_key: 'kern' or 'end'
-        :param func: apply to list of all ``thread`` of ``gpu`` time.
-        :return: Eval time results
-        """
-        return functools.partial(TimeFuncHelper._max_end, prof_type, end_key, func)
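
Putting the pieces together, a typical query against a loaded profiler dump would look like the following sketch (``prof.json`` is a placeholder for a dump produced by the megengine profiler):

    import json

    import numpy as np

    from megengine.utils.profile_analyzer import ProfileAnalyzer, TimeFuncHelper

    with open("prof.json") as fin:
        dump = json.load(fin)

    analyzer = ProfileAnalyzer(dump, lambda opr, inp, out: True)
    device_end = TimeFuncHelper.eval_time_func("device", "end", np.max)

    # three most expensive operator types, ranked by summed device time
    for rec in analyzer.select(
        device_end, aggregate="sum", aggregate_by="type", sort_by="time", top_k=3
    ):
        print(rec.info["type"], rec.info["count"], rec.time)
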
diff --git a/python_module/megengine/utils/types.py b/python_module/megengine/utils/types.py
deleted file mode 100644
index 465ca03c..00000000
--- a/python_module/megengine/utils/types.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import collections
-import functools
-
-
-def get_ndtuple(value, *, n, allow_zero=True):
-    r"""Converts possibly 1D tuple to nd tuple
-
-    :type allow_zero: bool
-    :param allow_zero: whether to allow zero tuple value"""
-    if not isinstance(value, collections.abc.Iterable):
-        value = int(value)
-        value = tuple([value for i in range(n)])
-    else:
-        assert len(value) == n, "tuple len is not equal to n: {}".format(value)
-        spatial_axis = map(int, value)
-        value = tuple(spatial_axis)
-    if allow_zero:
-        minv = 0
-    else:
-        minv = 1
-    assert min(value) >= minv, "invalid value: {}".format(value)
-    return value
-
-
-_single = functools.partial(get_ndtuple, n=1, allow_zero=True)
-_pair = functools.partial(get_ndtuple, n=2, allow_zero=True)
-_pair_nonzero = functools.partial(get_ndtuple, n=2, allow_zero=False)
-_triple = functools.partial(get_ndtuple, n=3, allow_zero=True)
-_quadruple = functools.partial(get_ndtuple, n=4, allow_zero=True)
diff --git a/python_module/megengine/version.py b/python_module/megengine/version.py
deleted file mode 100644
index 3d187266..00000000
--- a/python_module/megengine/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = "0.5.0"
diff --git a/python_module/requires-style.txt b/python_module/requires-style.txt
deleted file mode 100644
index 899aac52..00000000
--- a/python_module/requires-style.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-black==19.10b0
-isort==4.3.21
-pylint==2.4.3
-mypy==0.750
diff --git a/python_module/requires-test.txt b/python_module/requires-test.txt
deleted file mode 100644
index 9e732684..00000000
--- a/python_module/requires-test.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-pytest==5.3.0
-pytest-sphinx>=0.2.2
-pytest-json-report>=1.2.1
diff --git a/python_module/requires.txt b/python_module/requires.txt
deleted file mode 100644
index 5ebb52a3..00000000
--- a/python_module/requires.txt
+++ /dev/null
@@ -1,7 +0,0 @@ -numpy>=1.17 -opencv-python -pyarrow -requests -tabulate -tqdm -redispy diff --git a/python_module/setup.py b/python_module/setup.py deleted file mode 100644 index 414f4fcf..00000000 --- a/python_module/setup.py +++ /dev/null @@ -1,118 +0,0 @@ -# -*- coding: utf-8 -*- -# This file is part of MegBrain. -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - -import os -import re -import pathlib -import platform -from distutils.file_util import copy_file -from setuptools import setup, find_packages, Extension -from setuptools.command.build_ext import build_ext as _build_ext - -class PrecompiledExtesion(Extension): - def __init__(self, name): - super().__init__(name, sources=[]) - -class build_ext(_build_ext): - - def build_extension(self, ext): - if not isinstance(ext, PrecompiledExtesion): - return super().build_extension(ext) - - if not self.inplace: - fullpath = self.get_ext_fullpath(ext.name) - extdir = pathlib.Path(fullpath) - extdir.parent.mkdir(parents=True, exist_ok=True) - - modpath = self.get_ext_fullname(ext.name).split('.') - if platform.system() == "Windows": - modpath[-1] += '.pyd' - else: - modpath[-1] += '.so' - modpath = str(pathlib.Path(*modpath).resolve()) - - copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run) - -package_name = 'MegEngine' - -v = {} -with open("megengine/version.py") as fp: - exec(fp.read(), v) -__version__ = v['__version__'] - -email = 'megengine@megvii.com' -local_version = os.environ.get('LOCAL_VERSION') -if local_version: - __version__ = '{}+{}'.format(__version__, local_version) - -packages = find_packages(exclude=['test']) -package_data = [ - str(f.relative_to('megengine')) - for f in pathlib.Path('megengine', '_internal', 'include').glob('**/*') -] -package_data += [ - str(f.relative_to('megengine')) - for f in pathlib.Path('megengine', '_internal', 'lib').glob('**/*') -] -package_data += [ - os.path.join('module', 'pytorch', 'torch_mem_fwd.cpp') -] - -with open('requires.txt') as f: - requires = f.read().splitlines() -with open('requires-style.txt') as f: - requires_style = f.read().splitlines() -with open('requires-test.txt') as f: - requires_test = f.read().splitlines() - -setup_kwargs = dict( - name=package_name, - version=__version__, - description='Framework for numerical evaluation with ' - 'auto-differentiation', - author='Megvii Engine Team', - author_email=email, - packages=packages, - package_data={ - 'megengine': package_data, - }, - ext_modules=[PrecompiledExtesion('megengine._internal._mgb')], - install_requires=requires, - extras_require={ - 'dev': requires_style + requires_test, - 'ci': requires_test, - }, - cmdclass={'build_ext': build_ext}, -) - - -setup_kwargs.update(dict( - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Intended Audience :: Education', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: C++', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'Topic :: Software Development', - 'Topic :: Software Development :: Libraries', - 'Topic :: Software Development :: Libraries :: Python Modules', - ], - license='Apache 2.0', - keywords='megengine deep 
learning', - data_files = [("megengine", [ - "../LICENSE", - "../ACKNOWLEDGMENTS", - ])] -)) - -setup(**setup_kwargs) diff --git a/python_module/src/cpp/bfloat16.cpp b/python_module/src/cpp/bfloat16.cpp deleted file mode 100644 index 712e5219..00000000 --- a/python_module/src/cpp/bfloat16.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/** - * \file python_module/src/cpp/bfloat16.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief numpy dtypes for bfloat16 - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#include "megbrain/common.h" -#include "megbrain/dtype.h" - -#include -#include - -#define NO_IMPORT_ARRAY 1 -#include "./numpy_incl.h" - -#pragma GCC diagnostic ignored "-Wmissing-field-initializers" - -namespace { - -struct BFloat16Type { - static int npy_typenum; - mgb::dt_bfloat16 value; - - struct PyObj; - struct NpyType; - - template - struct NpyCast; -}; - -int BFloat16Type::npy_typenum; - -/* ==================== BFloat16Type::NpyCast ==================== */ - -template -struct BFloat16Type::NpyCast { - static void apply(void* from_, void* to_, npy_intp n, void* /*fromarr*/, - void* /*toarr*/) { - auto from = static_cast(from_); - auto to = static_cast(to_); - for (npy_intp i = 0; i < n; ++i) { - float cur = static_cast(from[i]); - to[i].value = cur; - } - } -}; - -template -struct BFloat16Type::NpyCast { - static void apply(void* from_, void* to_, npy_intp n, void* /*fromarr*/, - void* /*toarr*/) { - auto from = static_cast(from_); - auto to = static_cast(to_); - for (npy_intp i = 0; i < n; ++i) { - to[i] = from[i].value; - } - } -}; - -/* ==================== BFloat16Type::PyObj ==================== */ -struct BFloat16Type::PyObj { - PyObject_HEAD BFloat16Type obj; - - static PyTypeObject py_type; - - static PyObject* from_bfloat16(BFloat16Type val) { - auto p = reinterpret_cast(py_type.tp_alloc(&py_type, 0)); - p->obj.value = val.value; - return reinterpret_cast(p); - } - - static PyObject* py_new(PyTypeObject* type, PyObject* args, PyObject* kwds); - static PyObject* py_repr(PyObject* obj); - static PyObject* py_richcompare(PyObject* a, PyObject* b, int op); -}; -PyTypeObject BFloat16Type::PyObj::py_type; - -PyObject* BFloat16Type::PyObj::py_new(PyTypeObject* type, PyObject* args, - PyObject* kwds) { - PyObj* self; - Py_ssize_t size; - - self = (PyObj*)type->tp_alloc(type, 0); - - size = PyTuple_GET_SIZE(args); - if (size > 1) { - PyErr_SetString(PyExc_TypeError, "BFloat16Type Only has 1 parameter"); - return NULL; - } - PyObject* x = PyTuple_GET_ITEM(args, 0); - if (PyObject_IsInstance(x, (PyObject*)&py_type)) { - Py_INCREF(x); - return x; - } - - if (!PyFloat_Check(x)) { - PyErr_SetString(PyExc_TypeError, - "BFloat16Type must be initialized wit float"); - return NULL; - } - - const float s = PyFloat_AsDouble(x); - - self->obj.value = s; - - return (PyObject*)self; -} - -PyObject* BFloat16Type::PyObj::py_repr(PyObject* obj) { - float fval = static_cast(((PyObj*)obj)->obj.value); - return PyUnicode_FromString(mgb::ssprintf("%f", fval).c_str()); -} - -PyObject* BFloat16Type::PyObj::py_richcompare(PyObject* a, PyObject* b, - int op) { - mgb_assert(PyObject_IsInstance(a, (PyObject*)&py_type)); - auto bval = PyFloat_AsDouble(b); - if (bval == -1 && PyErr_Occurred()) { - return NULL; - } - double aval = ((PyObj*)a)->obj.value; -#define OP(py, op) \ - case py: { \ - if (aval op bval) { \ - Py_RETURN_TRUE; \ - } else { \ - Py_RETURN_FALSE; \ - } \ - } - switch (op) { - OP(Py_LT, <) - OP(Py_LE, <=) - 
OP(Py_EQ, ==) - OP(Py_NE, !=) - OP(Py_GT, >) - OP(Py_GE, >=) - }; -#undef OP - return Py_NotImplemented; -} - -/* ==================== BFloat16Type::NpyType ==================== */ -struct BFloat16Type::NpyType { - static PyArray_ArrFuncs funcs; - static PyArray_Descr descr; - - static bool init(); - - static void copyswap(void* dst, void* src, int swap, void* /*arr*/) { - if (src) { - mgb_assert(!swap); - memcpy(dst, src, sizeof(BFloat16Type)); - } - } - static PyObject* getitem(void* data, void* ap) { - return BFloat16Type::PyObj::from_bfloat16( - *static_cast(data)); - } - static int setitem(PyObject* op, void* ov, void* ap); -}; - -PyArray_ArrFuncs BFloat16Type::NpyType::funcs; -PyArray_Descr BFloat16Type::NpyType::descr; - -int BFloat16Type::NpyType::setitem(PyObject* op, void* ov, void* ap) { - if (PyLong_Check(op)) { - int a = PyLong_AsLong(op); - static_cast(ov)->value = a; - } else if (PyFloat_Check(op)) { - float a = PyFloat_AsDouble(op); - static_cast(ov)->value = a; - } else if (PyObject_IsInstance( - op, (PyObject*)(&(BFloat16Type::PyObj::py_type)))) { - static_cast(ov)->value = ((PyObj*)op)->obj.value; - } else { - PyErr_SetString(PyExc_ValueError, - "input type must be int/float/bfloat16"); - return -1; - } - return 0; -} - -bool BFloat16Type::NpyType::init() { - descr = {PyObject_HEAD_INIT(0) & BFloat16Type::PyObj::py_type, - 'V', // kind - 'f', // type - '=', // byteorder - NPY_NEEDS_PYAPI | NPY_USE_GETITEM | NPY_USE_SETITEM, - 1, // type num - sizeof(BFloat16Type), - alignof(BFloat16Type), - NULL, - NULL, - NULL, - &funcs}; - Py_TYPE(&descr) = &PyArrayDescr_Type; - PyArray_InitArrFuncs(&funcs); - funcs.copyswap = copyswap; - funcs.getitem = getitem; - funcs.setitem = setitem; - npy_typenum = PyArray_RegisterDataType(&descr); - -#define REGISTER_CAST(From, To, From_descr, To_typenum, safe) \ - { \ - PyArray_Descr* from_descr = (From_descr); \ - if (PyArray_RegisterCastFunc(from_descr, (To_typenum), \ - NpyCast::apply) < 0) { \ - return false; \ - } \ - if (safe && PyArray_RegisterCanCast(from_descr, (To_typenum), \ - NPY_NOSCALAR) < 0) { \ - return false; \ - } \ - } -#define REGISTER_INT_CASTS(bits) \ - REGISTER_CAST(npy_int##bits, BFloat16Type, \ - PyArray_DescrFromType(NPY_INT##bits), \ - BFloat16Type::npy_typenum, 1) \ - REGISTER_CAST(BFloat16Type, npy_int##bits, &descr, NPY_INT##bits, 0) \ - REGISTER_CAST(npy_uint##bits, BFloat16Type, \ - PyArray_DescrFromType(NPY_UINT##bits), \ - BFloat16Type::npy_typenum, 1) \ - REGISTER_CAST(BFloat16Type, npy_uint##bits, &descr, NPY_UINT##bits, 0) - - REGISTER_INT_CASTS(8) - REGISTER_INT_CASTS(16) - REGISTER_INT_CASTS(32) - REGISTER_INT_CASTS(64) - REGISTER_CAST(BFloat16Type, float, &descr, NPY_FLOAT, 0) - REGISTER_CAST(float, BFloat16Type, PyArray_DescrFromType(NPY_FLOAT), - BFloat16Type::npy_typenum, 0) - REGISTER_CAST(BFloat16Type, double, &descr, NPY_DOUBLE, 1) - REGISTER_CAST(double, BFloat16Type, PyArray_DescrFromType(NPY_DOUBLE), - BFloat16Type::npy_typenum, 0) - return true; -} - -} // anonymous namespace - -bool init_pytype_bfloat16() { - auto& py_type = BFloat16Type::PyObj::py_type; - py_type = {PyVarObject_HEAD_INIT(NULL, 0)}; - py_type.tp_name = "megbrain._mgb.pybfloat16"; - py_type.tp_basicsize = sizeof(BFloat16Type::PyObj); - py_type.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; - py_type.tp_doc = "bfloat16 type"; - py_type.tp_new = BFloat16Type::PyObj::py_new; - py_type.tp_str = BFloat16Type::PyObj::py_repr; - py_type.tp_repr = BFloat16Type::PyObj::py_repr; - py_type.tp_richcompare = 
BFloat16Type::PyObj::py_richcompare; - py_type.tp_base = &PyGenericArrType_Type; - return PyType_Ready(&py_type) >= 0; -} - -void register_pytype_bfloat16(PyObject* d, PyObject* m) { - Py_INCREF(&BFloat16Type::PyObj::py_type); - PyDict_SetItemString(d, "bfloat16_pytype", - (PyObject*)&BFloat16Type::PyObj::py_type); - PyModule_AddObject(m, "bfloat16_pytype", - (PyObject*)&BFloat16Type::PyObj::py_type); -} - -//! called from swig init -void _init_bfloat16_types(PyObject* m) { - if (m == NULL) - return; - PyObject* d = PyModule_GetDict(m); - PyArray_Descr* dtype; - if (!init_pytype_bfloat16()) - return; - if (!BFloat16Type::NpyType::init()) - return; - dtype = PyArray_DescrFromType(BFloat16Type::npy_typenum); - if (!dtype) - return; - { - PyObject* pytype = (PyObject*)(&BFloat16Type::PyObj::py_type); - Py_INCREF(pytype); - PyDict_SetItemString(d, "pybfloat16", pytype); - } - Py_INCREF(dtype); - PyDict_SetItemString(d, "bfloat16", (PyObject*)dtype); - register_pytype_bfloat16(d, m); - return; -} - -int mgb::npy_num_bfloat16() { - return BFloat16Type::npy_typenum; -} - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/craniotome.cpp b/python_module/src/cpp/craniotome.cpp deleted file mode 100644 index af9d17b0..00000000 --- a/python_module/src/cpp/craniotome.cpp +++ /dev/null @@ -1,435 +0,0 @@ -/** - * \file python_module/src/cpp/craniotome.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#include "./craniotome.h" -#include "./python_helper.h" -#include "megbrain/comp_node_env.h" -#include "megbrain/graph/grad_impl.h" -#include "megbrain/serialization/sereg.h" - -using namespace mgb; -using namespace opr; - -MGB_DYN_TYPE_OBJ_FINAL_IMPL(CraniotomeDesc); -MGB_DYN_TYPE_OBJ_FINAL_IMPL(Craniotome); - - -bool CraniotomeDesc::is_same_st(const mgb::Hashable &rhs) const { - auto rp = static_cast(rhs).py_self(); - size_t ref0 = rp->ob_refcnt; - bool ret; - { - PYTHON_GIL; - Py_INCREF(rp); - ret = _is_same(rp); - } - size_t ref1 = rp->ob_refcnt; - mgb_assert(ref0 == ref1, - "reference count changed from %zu to %zu", - ref0, ref1); - return ret; -} - -size_t CraniotomeDesc::hash() const { - return _hash(); -} - - -PyObject* CraniotomeDesc::py_self() const { - if (!m_py_self) { - PYTHON_GIL; - PyObject* dst = PyList_New(0); - mgb_assert(dst); - PyObjRefKeeper dst_ref{dst}; - - Py_INCREF(dst); - _setup_self(dst); - mgb_assert(dst->ob_refcnt == 1); - - mgb_assert(PyList_Size(dst) == 1); - m_py_self = PyList_GetItem(dst, 0); - } - - return m_py_self; -} - -class Craniotome::FuncDelCallbackInvoker final - : public UserDataContainer::UserData { - MGB_TYPEINFO_OBJ_DECL; - SmallVector m_oprs; - -public: - ~FuncDelCallbackInvoker() { - Craniotome* cur_opr = nullptr; - MGB_MARK_USED_VAR(cur_opr); - MGB_TRY { - std::vector arr; - for (auto i : m_oprs) { - cur_opr = i; - mgb_assert(i->m_on_graph_compile_called); - i->m_desc->_on_graph_compile_or_func_del(arr); - i->m_on_graph_compile_called = false; - } - } - MGB_HANDLE_EXCEPTION_DTOR( - ssprintf("craniotome opr %s", cur_opr->cname()).c_str()); - } - void add(Craniotome* opr) { m_oprs.push_back(opr); } -}; -MGB_TYPEINFO_OBJ_IMPL(Craniotome::FuncDelCallbackInvoker); - -Craniotome::Craniotome( - mgb::ComputingGraph *graph, std::unique_ptr desc, - const VarNodeArray &inputs, const OperatorNodeConfig &config): - Super{graph, config, desc->_get_opr_type_name().c_str(), inputs}, - 
m_node_flag{desc->_node_flag()}, - m_desc{std::move(desc)} -{ - for (auto i: inputs) - add_input({i}); - m_nr_dev_value_inp = input().size() - m_desc->_get_nr_dev_comp_order_deps(); - m_desc->_get_all_io_vars = [this]() { - SymbolVarArray ret; - ret.reserve(input().size() + output().size()); - for (auto i: input()) - ret.push_back(i); - for (auto i: output()) - ret.push_back(i); - return ret; - }; - - auto nr_out = m_desc->_get_nr_outputs(); - if (nr_out > 1) { - for (size_t i = 0, it = nr_out; i < it; ++ i) - add_output(ssprintf("o%zu", i)); - } else { - mgb_assert(nr_out == 1, - "could not create an operator with %zu outputs: %s", - nr_out, cname()); - add_output(None); - } - if (output_no_sys_mem_alloc()) { - for (auto i: output()) - i->add_flag(VarNode::Flag::NO_SYS_MEM_ALLOC); - } - if (m_node_flag & NodeFlag::ALLOW_EMPTY_OUTPUT) { - for (auto i: output()) - i->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE); - } - add_equivalence_component(m_desc.get()); - - // init comp node early because desc may access it - this->init_output_comp_node(); - m_desc->owner_opr = this; -} - -Craniotome::~Craniotome() noexcept { - if (m_on_graph_compile_called) { - m_desc->_on_graph_compile_or_func_del({}); - m_on_graph_compile_called = false; - } -} - -Craniotome::NodeProp* Craniotome::do_make_node_prop() const { - auto ret = Super::do_make_node_prop(); - if (m_node_flag & NodeFlag::DISALLOW_DUPLICATE) { - ret->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP); - } - if (m_nr_dev_value_inp < input().size()) { - using DT = NodeProp::DepType; - SmallVector
dep_types(input().size(), DT::DEV_VALUE); - for (size_t i = m_nr_dev_value_inp; i < dep_types.size(); ++i) { - dep_types[i] = DT::DEV_COMP_ORDER; - } - ret->reset_dep_type(input(), dep_types); - } - return ret; -} - -void Craniotome::scn_do_execute() { - m_prev_inferred_shape.invalidate(); - auto &&env = CompNodeEnv::from_comp_node(comp_node()); - env.activate(); - std::vector inpval(m_nr_dev_value_inp); - std::vector outval(output().size()); - auto dest_cn = comp_node(); - for (size_t i = 0; i < inpval.size(); ++ i) { - auto ivar = input(i); - if (ivar->comp_node() == dest_cn) { - inpval[i].setup(input(i)->dev_tensor(), false); - } else { - auto tensor = input(i)->dev_tensor(); - tensor.comp_node(dest_cn); - inpval[i].setup(tensor, false); - } - } - - TensorShapeArray orig_shape; - std::vector orig_ptr; - if (output_no_sys_mem_alloc()) { - for (size_t i = 0; i < outval.size(); ++ i) - outval[i].assign(output(i)); - } else { - for (size_t i = 0; i < outval.size(); ++ i) { - outval[i].assign(output(i)->dev_tensor()); - orig_shape.push_back(output(i)->shape()); - orig_ptr.push_back(output(i)->dev_tensor().raw_ptr()); - } - } - m_desc->_execute(inpval, outval); - mgb_assert(outval.size() == output().size()); - if (!output_no_sys_mem_alloc()) { - for (size_t i = 0; i < outval.size(); ++ i) { - mgb_assert(output(i)->shape().eq_shape(orig_shape[i]) && - orig_ptr[i] == output(i)->dev_tensor().raw_ptr(), - "%s: shape or ptr of output %zu changed", - cname(), i); - } - } -} - -void Craniotome::get_output_var_shape(const TensorShapeArray &inp_shape, - TensorShapeArray &out_shape) const { - TensorShapeVec cvt_ishp(inp_shape.size()); - for (size_t i = 0; i < cvt_ishp.size(); ++ i) - cvt_ishp[i] = npy::shape2vec(inp_shape[i]); - auto cvt_oshp = m_desc->_infer_shape(cvt_ishp); - mgb_assert(cvt_oshp.size() == output().size()); - out_shape.resize(cvt_oshp.size()); - for (size_t i = 0; i < cvt_oshp.size(); ++ i) - out_shape[i] = npy::vec2shape(cvt_oshp[i]); -} - -MGB_IMPL_OPR_GRAD(Craniotome) { - if (wrt_idx >= opr.nr_dev_value_inp()) { - return nullptr; - } - SymbolVarArray isv(opr.nr_dev_value_inp()), osv(opr.output().size()), - ogsv(out_grad.size()); - for (size_t i = 0; i < isv.size(); ++i) - isv[i] = opr.input(i); - for (size_t i = 0; i < osv.size(); ++i) - osv[i] = opr.output(i); - for (size_t i = 0; i < out_grad.size(); ++i) - ogsv[i] = out_grad[i]; - - auto ret = cg::to_var_node_array(const_cast(opr.desc()) - ._grad(wrt_idx, isv, osv, ogsv)); - - auto update_shape = [&opr](size_t i, VarNode* var) { - auto inp = opr.input(i); - if (var && cg::is_static_var_shape(inp) && - !cg::is_static_var_shape(var)) { - var = SymbolVar{var}.reshape(SymbolVar{inp}.symshape()).node(); - } - return var; - }; - if (ret.size() != 1) { - mgb_assert(ret.size() == opr.input().size()); - for (size_t i = 0; i < ret.size(); ++i) { - ret[i] = update_shape(i, ret[i]); - } - return ret; - } - return update_shape(wrt_idx, ret[0]); -} - -void Craniotome::add_input_layout_constraint() { - for (auto i : input()) - i->add_layout_constraint_contiguous(); - - if (!m_on_graph_compile_called) { - // check used outputs and call _on_graph_compile - auto graph = owner_graph(); - auto&& out = output(); - std::vector used_outputs; - used_outputs.reserve(out.size()); - for (size_t i = 0; i < out.size(); ++i) { - if (!graph->var_receiver_in_current_comp_seq(out[i]).empty()) { - used_outputs.push_back(i); - } - } - mgb_assert(!used_outputs.empty()); - m_desc->_on_graph_compile_or_func_del(used_outputs); - auto seq = graph->current_comp_seq(); 
- if (seq) { - seq->user_data() - .get_user_data_or_create() - ->add(this); - } else { - mgb_assert(graph->options().eager_evaluation); - } - m_on_graph_compile_called = true; - } -} - -SymbolVarArray Craniotome::make( - std::unique_ptr desc, - const SymbolVarArray &inputs, - const OperatorNodeConfig &config) { - VarNodeArray inp_vn(inputs.size()); - for (size_t i = 0; i < inputs.size(); ++ i) - inp_vn[i] = inputs[i].node(); - ComputingGraph *graph; - if (!inputs.empty()) { - graph = inp_vn[0]->owner_graph(); - } else { - graph = &desc->_get_comp_graph().get(); - mgb_assert(graph); - } - auto opr = graph->insert_opr( - std::make_unique( - graph, std::move(desc), inp_vn, config)); - SymbolVarArray rst; - for (auto i: opr->output()) - rst.push_back(i); - return rst; -} - -void Craniotome::init_output_static_infer_desc() { - if (!(m_node_flag & NodeFlag::DYNAMIC_OUTPUT_SHAPE)) { - Super::init_output_static_infer_desc(); - } else if (input().empty()) { - using namespace cg::static_infer; - auto &&mgr = owner_graph()->static_infer_manager(); - for (size_t idx = 0; idx < output().size(); ++ idx) { - auto infer = [this, idx](TensorShape &dest, const InpVal &) { - if (!m_prev_inferred_shape.valid()) { - auto &&shp = m_prev_inferred_shape.emplace(); - shp.resize(output().size()); - get_output_var_shape({}, shp); - } - dest = m_prev_inferred_shape->at(idx); - return true; - }; - mgr.register_shape_infer(output(idx), - {SourceType::MUTABLE, {}, infer}); - } - } -} - -void Craniotome::init_output_dtype() { - PYTHON_GIL; - - auto input_dtypes = PyList_New(input().size()), - ret = PyList_New(output().size()); - mgb_assert(input_dtypes); - PyObjRefKeeper input_dtypes_ref{input_dtypes}; - - mgb_assert(ret); - PyObjRefKeeper ret_ref{ret}; - - for (size_t i = 0; i < input().size(); ++ i) { - auto err = PyList_SetItem(input_dtypes, i, - npy::dtype_mgb2np(input(i)->dtype())); - mgb_assert(!err); - } - - // it seems that we need to incref before passing it to swig director method - Py_INCREF(input_dtypes); - Py_INCREF(ret); - if (!m_desc->_init_output_dtype(input_dtypes, ret)) { - Super::init_output_dtype(); - return; - } - - mgb_assert(PyList_Check(ret), - "_init_output_dtype should return list"); - mgb_assert(PyList_Size(ret) == static_cast(output().size()), - "_init_output_dtype list size not equal to number of outputs"); - for (size_t i = 0; i < output().size(); ++ i) { - auto cur = PyList_GetItem(ret, i); - mgb_assert(cur, "failed to get dtype for output %zu", i); - output(i)->dtype(npy::dtype_np2mgb(cur)); - } - - mgb_assert(input_dtypes->ob_refcnt == 1); - mgb_assert(ret->ob_refcnt == 1); -} - -// serialization -namespace { - - void craniotome_dumper( - serialization::OprDumpContext &ctx, - const cg::OperatorNodeBase &opr) { - - auto &&desc = opr.cast_final_safe().desc(); - auto result = PyList_New(0); - mgb_assert(result); - PyObjRefKeeper result_ref{result}; - - Py_INCREF(result); - desc._setup_serialize_params(result); - mgb_assert(result->ob_refcnt == 1); - - auto sz = PyList_Size(result); - mgb_assert(sz >= 1 && sz <= 2); - - auto name_obj = PyList_GetItem(result, 0); - mgb_assert(name_obj && PyUnicode_Check(name_obj)); - Py_ssize_t name_size; - const char *name_str = PyUnicode_AsUTF8AndSize(name_obj, &name_size); - mgb_assert(name_str); - - char *param_str = nullptr; - Py_ssize_t param_size = 0; - if (sz == 2) { - auto param_obj = PyList_GetItem(result, 1); - mgb_assert(param_obj && PyBytes_Check(param_obj)); - auto err = PyBytes_AsStringAndSize( - param_obj, ¶m_str, ¶m_size); - mgb_assert(!err); 
- } - - - ctx.dump_buf_with_len(name_str, name_size); - if (param_str) { - ctx.dump_buf_with_len(param_str, param_size); - } - } - - cg::OperatorNodeBase* craniotome_shallow_copy( - const serialization::OprShallowCopyContext &ctx, - const cg::OperatorNodeBase &opr, const VarNodeArray &inputs, - const OperatorNodeConfig &config) { - - MGB_MARK_USED_VAR(ctx); - auto &&orig_desc = opr.cast_final_safe().desc(); - std::unique_ptr desc; - mgb_assert(!orig_desc._set_copy_result); - orig_desc._set_copy_result = [&desc](CraniotomeDesc *r) { - mgb_assert(!desc); - desc.reset(r); - }; - orig_desc._copy(); - mgb_assert(desc); - orig_desc._set_copy_result = {}; - - mgb_assert(&orig_desc != desc.get()); - return Craniotome::make(std::move(desc), - {inputs.begin(), inputs.end()}, config).at(0).node( - )->owner_opr(); - } - - class _RegDumper { - public: - _RegDumper() { - serialization::OprRegistry::add_using_dynamic_loader( - Craniotome::typeinfo(), "Craniotome", - craniotome_dumper); - MGB_REG_OPR_SHALLOW_COPY_IMPL( - Craniotome, craniotome_shallow_copy); - } - }; - _RegDumper _reg_dumper; - -} // anonymous namespace - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/craniotome.h b/python_module/src/cpp/craniotome.h deleted file mode 100644 index d4754cb0..00000000 --- a/python_module/src/cpp/craniotome.h +++ /dev/null @@ -1,193 +0,0 @@ -/** - * \file python_module/src/cpp/craniotome.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief extend megbrain operators in python - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#pragma once - -#include "megbrain/graph/operator_node.h" -#include "./megbrain_wrap.h" - -using TensorShapeVec = std::vector>; -using SymbolVarArray = mgb::SymbolVarArray; - -namespace mgb { -namespace opr { -class Craniotome; -} // namespace opr -} // namespace mgb - -class CraniotomeDesc: public mgb::Hashable { - MGB_DYN_TYPE_OBJ_FINAL_DECL; - - mutable PyObject *m_py_self = nullptr; - - bool is_same_st(const mgb::Hashable &rhs) const override; - - size_t hash() const override; - - public: - struct NodeFlag { - static constexpr uint32_t - DYNAMIC_OUTPUT_SHAPE = 1 << 0, - DISALLOW_DUPLICATE = 1 << 1, - ALLOW_EMPTY_OUTPUT = 1 << 2, - DISABLE_SYS_MEM_ALLOC = 1 << 3; - }; - virtual ~CraniotomeDesc() = default; - - mgb::opr::Craniotome* owner_opr = nullptr; - - //! get final py object that implements this interface - PyObject* py_self() const ; - - //! store self in \p result which is a list - virtual void _setup_self(PyObject *result) const = 0; - - virtual bool _is_same(PyObject *rhs) const = 0; - - virtual uint32_t _node_flag() const = 0; - - virtual size_t _hash() const = 0; - - virtual std::string _get_opr_type_name() = 0; - - virtual size_t _get_nr_outputs() = 0; - - virtual void _execute( - const std::vector &inputs, - std::vector &outputs) = 0; - - /*! - * \brief infer output shape if DYNAMIC_OUTPUT_SHAPE is not set - */ - virtual TensorShapeVec _infer_shape( - const TensorShapeVec &inp_shape) = 0; - - virtual SymbolVarArray _grad( - size_t wrt_idx, - const SymbolVarArray &inputs, - const SymbolVarArray &outputs, - const SymbolVarArray &out_grad) = 0; - - virtual size_t _get_nr_dev_comp_order_deps() = 0; - - mgb::thin_function _get_all_io_vars; - - /*! 
- * \brief get output dtypes from input dtypes - * \param[in] input_dtypes python list of input - * \param[out] result initialized as an empty python list, and should - * be filled with output dtypes - * \return whether user has set the dtype - */ - virtual bool _init_output_dtype( - PyObject *input_dtypes, PyObject *result) = 0; - - /*! - * \brief get computing graph when no input var is provided - */ - virtual CompGraph _get_comp_graph() = 0; - - /*! - * \brief copy this CraniotomeDesc - * - * The implementation must call _set_copy_result() to return the result; - * this is used to bypass some swig issues. - */ - virtual void _copy() const = 0; - mutable mgb::thin_function _set_copy_result; - - /*! - * \brief setup params for serialization - * \param output an allocated list. One or two elements should be - * inserted in it after this function returns: the first element - * should be a string, indicating the id to be passed to - * opr_maker_loader; the second element, if exists, must be a byte - * object containing extra param that should be written to file. - */ - virtual void _setup_serialize_params(PyObject *output) const = 0; - - /*! - * \brief callback invoked when the graph is compiled or when func is - * destructed - * - * If the graph is compiled but not executed, this function might not be - * called - * - * \param used_outputs an array indices indicating the used output vars; - * this argument being empty means that the previously compiled - * func is destructed - */ - virtual void _on_graph_compile_or_func_del( - const std::vector& used_outputs) = 0; -}; - - -namespace mgb { -namespace opr { - -MGB_DEFINE_OPR_CLASS(Craniotome, cg::SingleCNOutshapePureByInshapeOprBase) // { - class FuncDelCallbackInvoker; - using NodeFlag = CraniotomeDesc::NodeFlag; - - bool m_on_graph_compile_called = false; - const uint32_t m_node_flag; - - //! DEV_COMP_ORDER inputs are at the tail of input array; this is the - //! number of DEV_VALUE inputs, and also the index of the first - //! DEV_COMP_ORDER input - size_t m_nr_dev_value_inp; - - std::unique_ptr m_desc; - - //! previously inferred shape; used when there is no input and - //! 
m_is_dynamic_output_shape is set to true - Maybe m_prev_inferred_shape; - - void scn_do_execute() override; - void get_output_var_shape(const TensorShapeArray &inp_shape, - TensorShapeArray &out_shape) const override; - - void add_input_layout_constraint() override; - - void init_output_static_infer_desc() override; - void init_output_dtype() override; - NodeProp* do_make_node_prop() const override; - - bool output_no_sys_mem_alloc() const { - return m_node_flag & (NodeFlag::DYNAMIC_OUTPUT_SHAPE | - NodeFlag::DISABLE_SYS_MEM_ALLOC); - } - - public: - Craniotome(mgb::ComputingGraph *graph, - std::unique_ptr desc, - const VarNodeArray &inputs, const OperatorNodeConfig &config); - - ~Craniotome() noexcept; - - static SymbolVarArray make( - std::unique_ptr desc, - const SymbolVarArray &inputs, - const OperatorNodeConfig &config = {}); - - const CraniotomeDesc& desc() const { - return *m_desc; - } - - size_t nr_dev_value_inp() const { - return m_nr_dev_value_inp; - } -}; - -} // namespace opr -} // namespace mgb - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/function_replace.cpp b/python_module/src/cpp/function_replace.cpp deleted file mode 100644 index 6fc0fb63..00000000 --- a/python_module/src/cpp/function_replace.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/** - * \file python_module/src/cpp/function_replace.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief replace functions in megbrain core - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#include "./megbrain_wrap.h" -#include "./python_helper.h" - -#include "megbrain/utils/debug.h" -#include "megbrain/common.h" -#include "megbrain/system.h" - -#include -#include -#include - -#include - -#ifdef WIN32 -#include -#include -#else -#include -#endif - -namespace { - -PyObject *logger = nullptr; - -#if MGB_ENABLE_DEBUG_UTIL -void throw_fork_cuda_exc() { - // set python error state, so when returning to parent process that calls - // fork(), an exception could be raised - // - // call chain: - // python -> fork() -> pthread_atfork -> CudaCheckOnFork -> - // ForkAfterCudaError::throw_ - mgb_log_warn("try to raise python exception for fork after cuda"); - PyErr_SetString(PyExc_SystemError, "fork after cuda has been initialized"); -} -#endif - -class Init { - static Init inst; - Init() { -#if MGB_ENABLE_DEBUG_UTIL - mgb::debug::ForkAfterCudaError::throw_ = throw_fork_cuda_exc; -#endif - } -}; -Init Init::inst; - -int fork_exec_impl(const std::string& arg0, const std::string& arg1, - const std::string& arg2) { -#ifdef WIN32 - STARTUPINFO si; - PROCESS_INFORMATION pi; - ZeroMemory(&si, sizeof(si)); - si.cb = sizeof(si); - ZeroMemory(&pi, sizeof(pi)); - auto args_str = " " + arg1 + " " + arg2; - - // Start the child process. 
- if (!CreateProcess(arg0.c_str(), // exe name - const_cast(args_str.c_str()), // Command line - NULL, // Process handle not inheritable - NULL, // Thread handle not inheritable - FALSE, // Set handle inheritance to FALSE - 0, // No creation flags - NULL, // Use parent's environment block - NULL, // Use parent's starting directory - &si, // Pointer to STARTUPINFO structure - &pi) // Pointer to PROCESS_INFORMATION structure - ) { - mgb_log_warn("CreateProcess failed (%lu).\n", GetLastError()); - fprintf(stderr, "[megbrain] failed to execl %s [%s, %s]\n", - arg0.c_str(), arg1.c_str(), arg2.c_str()); - __builtin_trap(); - } - return pi.dwProcessId; -#else - auto pid = fork(); - if (!pid) { - execl(arg0.c_str(), arg0.c_str(), arg1.c_str(), arg2.c_str(), nullptr); - fprintf(stderr, "[megbrain] failed to execl %s [%s, %s]: %s\n", - arg0.c_str(), arg1.c_str(), arg2.c_str(), - std::strerror(errno)); - std::terminate(); - } - mgb_assert(pid > 0, "failed to fork: %s", std::strerror(errno)); - return pid; -#endif -} - -} // anonymous namespace - -// called from swig/misc.i -void _timed_func_set_fork_exec_path(const char *arg0, const char *arg1) { - using namespace std::placeholders; - mgb::sys::TimedFuncInvoker::ins().set_fork_exec_impl( - std::bind(fork_exec_impl, std::string{arg0}, std::string{arg1}, - _1)); -} - -void _timed_func_exec_cb(const char *user_data) { - mgb::sys::TimedFuncInvoker::ins().fork_exec_impl_mainloop(user_data); -} - -void _register_logger(PyObject *l) { - logger = l; -} - -namespace { -void py_log_handler(mgb::LogLevel level, - const char *file, const char *func, int line, const char *fmt, - va_list ap) { - if (global_finalized()) { - return; - } - - using mgb::LogLevel; - - MGB_MARK_USED_VAR(file); - MGB_MARK_USED_VAR(func); - MGB_MARK_USED_VAR(line); - - if (!logger) - return; - - PYTHON_GIL; - - const char *py_type; - switch (level) { - case LogLevel::DEBUG: - py_type = "debug"; - break; - case LogLevel::INFO: - py_type = "info"; - break; - case LogLevel::WARN: - py_type = "warning"; - break; - case LogLevel::ERROR: - py_type = "error"; - break; - default: - throw std::runtime_error("bad log level"); - } - - std::string msg = mgb::svsprintf(fmt, ap); - PyObject *py_msg = Py_BuildValue("s", msg.c_str()); - PyObject_CallMethod(logger, - const_cast(py_type), const_cast("O"), - py_msg); - Py_DECREF(py_msg); -} - -class LogHandlerSetter { - static LogHandlerSetter ins; - public: - LogHandlerSetter() { - mgb::set_log_handler(py_log_handler); - } -}; -LogHandlerSetter LogHandlerSetter::ins; -} // anobymous namespace - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/intbx.cpp b/python_module/src/cpp/intbx.cpp deleted file mode 100644 index 104a2353..00000000 --- a/python_module/src/cpp/intbx.cpp +++ /dev/null @@ -1,364 +0,0 @@ -/** - * \file python_module/src/cpp/intbx.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief numpy dtypes for low bit - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#include "megbrain/common.h" - -#include -#include - -#define NO_IMPORT_ARRAY 1 -#include "./numpy_incl.h" - -#pragma GCC diagnostic ignored "-Wmissing-field-initializers" - -namespace { - -template -struct LowBitType { - static_assert(N < 8, "low bit only supports less than 8 bits"); - static int npy_typenum; - //! 
numerical value (-3, -1, 1, 3) - int8_t value; - - struct PyObj; - struct NpyType; - - const static int32_t max_value = (1 << N) - 1; - - //! check whether val is (-3, -1, 1, 3) and set python error - static bool check_value_set_err(int val) { - int t = val + max_value; - if ((t & 1) || t < 0 || t > (max_value << 1)) { - PyErr_SetString(PyExc_ValueError, - mgb::ssprintf("low bit dtype number error: " - "value=%d; allowed {-3, -1, 1, 3}", - val) - .c_str()); - return false; - } - - return true; - } - - template - struct NpyCast; -}; - -template -int LowBitType::npy_typenum; - -/* ==================== LowBitType::NpyCast ==================== */ - -template -template -struct LowBitType::NpyCast> { - static void apply(void* from_, void* to_, npy_intp n, void* /*fromarr*/, - void* /*toarr*/) { - auto from = static_cast(from_); - auto to = static_cast*>(to_); - for (npy_intp i = 0; i < n; ++i) { - int cur = static_cast(from[i]); - if (!LowBitType::check_value_set_err(cur)) - return; - to[i].value = cur; - } - } -}; - -template -template -struct LowBitType::NpyCast, T> { - static void apply(void* from_, void* to_, npy_intp n, void* /*fromarr*/, - void* /*toarr*/) { - auto from = static_cast*>(from_); - auto to = static_cast(to_); - for (npy_intp i = 0; i < n; ++i) { - to[i] = from[i].value; - } - } -}; - -/* ==================== LowBitType::PyObj ==================== */ -template -struct LowBitType::PyObj { - PyObject_HEAD LowBitType obj; - - static PyTypeObject py_type; - - static PyObject* from_lowbit(LowBitType val) { - auto p = reinterpret_cast(py_type.tp_alloc(&py_type, 0)); - p->obj.value = val.value; - return reinterpret_cast(p); - } - - static PyObject* py_new(PyTypeObject* type, PyObject* args, PyObject* kwds); - static PyObject* py_repr(PyObject* obj); - static PyObject* py_richcompare(PyObject* a, PyObject* b, int op); -}; -template -PyTypeObject LowBitType::PyObj::py_type; - -template -PyObject* LowBitType::PyObj::py_new(PyTypeObject* type, PyObject* args, - PyObject* kwds) { - PyObj* self; - Py_ssize_t size; - - self = (PyObj*)type->tp_alloc(type, 0); - - size = PyTuple_GET_SIZE(args); - if (size > 1) { - PyErr_SetString(PyExc_TypeError, "LowBitType Only has 1 parameter"); - return NULL; - } - PyObject* x = PyTuple_GET_ITEM(args, 0); - if (PyObject_IsInstance(x, (PyObject*)&py_type)) { - Py_INCREF(x); - return x; - } - - if (!PyLong_Check(x)) { - PyErr_SetString(PyExc_TypeError, - "LowBitType must be initialized wit int"); - return NULL; - } - - const long s = PyLong_AsLong(x); - - self->obj.value = s; - - return (PyObject*)self; -} - -template -PyObject* LowBitType::PyObj::py_repr(PyObject* obj) { - return PyUnicode_FromFormat("%d", ((PyObj*)obj)->obj.value); -} - -template -PyObject* LowBitType::PyObj::py_richcompare(PyObject* a, PyObject* b, - int op) { - mgb_assert(PyObject_IsInstance(a, (PyObject*)&py_type)); - auto bval = PyFloat_AsDouble(b); - if (bval == -1 && PyErr_Occurred()) { - return NULL; - } - double aval = ((PyObj*)a)->obj.value; -#define OP(py, op) \ - case py: { \ - if (aval op bval) { \ - Py_RETURN_TRUE; \ - } else { \ - Py_RETURN_FALSE; \ - } \ - } - switch (op) { - OP(Py_LT, <) - OP(Py_LE, <=) - OP(Py_EQ, ==) - OP(Py_NE, !=) - OP(Py_GT, >) - OP(Py_GE, >=) - }; -#undef OP - return Py_NotImplemented; -} - -/* ==================== LowBitType::NpyType ==================== */ -template -struct LowBitType::NpyType { - static PyArray_ArrFuncs funcs; - static PyArray_Descr descr; - - static bool init(); - - static void copyswap(void* dst, void* src, int swap, void* 
/*arr*/) { - if (src) { - mgb_assert(!swap); - memcpy(dst, src, sizeof(LowBitType)); - } - } - static PyObject* getitem(void* data, void* ap) { - return LowBitType::PyObj::from_lowbit( - *static_cast*>(data)); - } - static int setitem(PyObject* op, void* ov, void* ap); - static int fill(void* data_, npy_intp length, void* arr); -}; - -template -PyArray_ArrFuncs LowBitType::NpyType::funcs; -template -PyArray_Descr LowBitType::NpyType::descr; - -template -int LowBitType::NpyType::setitem(PyObject* op, void* ov, void* ap) { - if (!PyLong_Check(op)) { - PyErr_SetString(PyExc_ValueError, "input type must be int"); - return -1; - } - - int a = PyLong_AsLong(op); - if (!check_value_set_err(a)) - return -1; - - static_cast*>(ov)->value = a; - return 0; -} - -template -int LowBitType::NpyType::fill(void* data_, npy_intp length, void* arr) { - auto data = static_cast*>(data_); - int8_t delta = data[1].value - data[0].value, r = data[1].value; - if (!check_value_set_err(data[0].value) || - !check_value_set_err(data[1].value)) - return -1; - for (npy_intp i = 2; i < length; i++) { - r += delta; - if (r > max_value) - r = -max_value; - else if (r < -max_value) - r = max_value; - data[i].value = r; - } - return 0; -} - -template -bool LowBitType::NpyType::init() { - descr = {PyObject_HEAD_INIT(0) & LowBitType::PyObj::py_type, - 'V', // kind - 'r', // type - '=', // byteorder - NPY_NEEDS_PYAPI | NPY_USE_GETITEM | NPY_USE_SETITEM, - 0, // type num - sizeof(LowBitType), - alignof(LowBitType), - NULL, - NULL, - NULL, - &funcs}; - Py_TYPE(&descr) = &PyArrayDescr_Type; - PyArray_InitArrFuncs(&funcs); - funcs.copyswap = copyswap; - funcs.getitem = getitem; - funcs.setitem = setitem; - funcs.fill = fill; - npy_typenum = PyArray_RegisterDataType(&descr); - -#define REGISTER_CAST(From, To, From_descr, To_typenum, safe) \ - { \ - PyArray_Descr* from_descr = (From_descr); \ - if (PyArray_RegisterCastFunc(from_descr, (To_typenum), \ - NpyCast::apply) < 0) { \ - return false; \ - } \ - if (safe && PyArray_RegisterCanCast(from_descr, (To_typenum), \ - NPY_NOSCALAR) < 0) { \ - return false; \ - } \ - } -#define REGISTER_INT_CASTS(bits) \ - REGISTER_CAST(npy_int##bits, LowBitType, \ - PyArray_DescrFromType(NPY_INT##bits), \ - LowBitType::npy_typenum, 1) \ - REGISTER_CAST(LowBitType, npy_int##bits, &descr, NPY_INT##bits, 0) \ - REGISTER_CAST(npy_uint##bits, LowBitType, \ - PyArray_DescrFromType(NPY_UINT##bits), \ - LowBitType::npy_typenum, 1) \ - REGISTER_CAST(LowBitType, npy_uint##bits, &descr, NPY_UINT##bits, 0) - - REGISTER_INT_CASTS(8) - REGISTER_INT_CASTS(16) - REGISTER_INT_CASTS(32) - REGISTER_INT_CASTS(64) - REGISTER_CAST(LowBitType, float, &descr, NPY_FLOAT, 0) - REGISTER_CAST(float, LowBitType, PyArray_DescrFromType(NPY_FLOAT), - LowBitType::npy_typenum, 0) - REGISTER_CAST(LowBitType, double, &descr, NPY_DOUBLE, 1) - REGISTER_CAST(double, LowBitType, PyArray_DescrFromType(NPY_DOUBLE), - LowBitType::npy_typenum, 0) - return true; -} - -} // anonymous namespace - -#define DEFINE_INTBX(n) using IntB##n = LowBitType; -FOREACH_MGB_LOW_BIT(DEFINE_INTBX) -#undef DEFINE_INTBX - -#define MGB_STR_HELPER(n) #n - -#define DEFINE_INIT_PYTYPE(n) \ - bool init_pytype_intb##n() { \ - auto& py_type = IntB##n::PyObj::py_type; \ - py_type = {PyVarObject_HEAD_INIT(NULL, 0)}; \ - py_type.tp_name = "megbrain._mgb.pyintb" MGB_STR_HELPER(n); \ - py_type.tp_basicsize = sizeof(IntB##n::PyObj); \ - py_type.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; \ - py_type.tp_doc = "a low bit int type"; \ - py_type.tp_new = 
IntB##n::PyObj::py_new; \ - py_type.tp_str = IntB##n::PyObj::py_repr; \ - py_type.tp_repr = IntB##n::PyObj::py_repr; \ - py_type.tp_richcompare = IntB##n::PyObj::py_richcompare; \ - py_type.tp_base = &PyGenericArrType_Type; \ - return PyType_Ready(&py_type) >= 0; \ - } -FOREACH_MGB_LOW_BIT(DEFINE_INIT_PYTYPE) -#undef DEFINE_INIT_PYTYPE - -#define DEFINE_REGISTER_FUNC(n) \ - void register_pytype_intb##n(PyObject* d, PyObject* m) { \ - Py_INCREF(&IntB##n::PyObj::py_type); \ - PyDict_SetItemString(d, "intb" MGB_STR_HELPER(n) "_pytype", \ - (PyObject*)&IntB##n::PyObj::py_type); \ - PyModule_AddObject(m, "intb" MGB_STR_HELPER(n) "_pytype", \ - (PyObject*)&IntB##n::PyObj::py_type); \ - } -FOREACH_MGB_LOW_BIT(DEFINE_REGISTER_FUNC) -#undef DEFINE_REGISTER_FUNC - -//! called from swig init -void _init_intbx_types(PyObject* m) { - if (m == NULL) - return; - PyObject* d = PyModule_GetDict(m); - PyArray_Descr* dtype; -#define DEFINE_INIT_INTBX_TYPE(n) \ - if (!init_pytype_intb##n()) \ - return; \ - if (!IntB##n::NpyType::init()) \ - return; \ - dtype = PyArray_DescrFromType(IntB##n::npy_typenum); \ - if (!dtype) \ - return; \ - { \ - PyObject* pytype = (PyObject*)(&IntB##n::PyObj::py_type); \ - Py_INCREF(pytype); \ - PyDict_SetItemString(d, "pyintb" MGB_STR_HELPER(n), pytype); \ - } \ - Py_INCREF(dtype); \ - PyDict_SetItemString(d, "intb" MGB_STR_HELPER(n), (PyObject*)dtype); \ - register_pytype_intb##n(d, m); - FOREACH_MGB_LOW_BIT(DEFINE_INIT_INTBX_TYPE) -#undef DEFINE_INIT_INTBX_TYPE - return; -} - -#define DEFINE_NPY_INTBX(n) \ - int mgb::npy_num_intb##n() { return IntB##n::npy_typenum; } -FOREACH_MGB_LOW_BIT(DEFINE_NPY_INTBX) -#undef DEFINE_NPY_INTBX -/*int mgb::npy_num_intb2() { - return IntB2::npy_typenum; -}*/ - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/megbrain_config.cpp b/python_module/src/cpp/megbrain_config.cpp deleted file mode 100644 index fa31950e..00000000 --- a/python_module/src/cpp/megbrain_config.cpp +++ /dev/null @@ -1,517 +0,0 @@ -/** - * \file python_module/src/cpp/megbrain_config.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
- * - */ - -#include "./megbrain_config.h" -#include "./python_helper.h" - -#include "megbrain/graph/event.h" -#include "megbrain/utils/debug.h" -#include "megbrain/comp_node_env.h" -#include "megbrain/serialization/opr_registry.h" - -#include -#include -#include -#include - -#ifdef WIN32 -#include -#include -#else -#include -#endif - -#if MGB_ENABLE_OPR_MM -#include "megbrain/opr/mm_handler.h" -#endif - -#if MGB_CUDA -#include -#endif - -#ifdef WIN32 -#define F_OK 0 -#define RTLD_LAZY 0 -#define RTLD_GLOBAL 0 -#define RTLD_NOLOAD 0 -#define access(a, b) false -#define SPLITER ';' -#define ENV_PATH "Path" -#define NVCC_EXE "nvcc.exe" -static void* dlopen(const char* file, int) { - return static_cast(LoadLibrary(file)); -} - -static void* dlerror() { - const char* errmsg = "dlerror not available on Windows"; - return const_cast(errmsg); -} - -static void* dlsym(void* handle, const char* name) { - FARPROC symbol = GetProcAddress((HMODULE)handle, name); - return reinterpret_cast(symbol); -} - -static int check_file_exist(const char* path, int mode) { - return _access(path, mode); -} -#else -#define SPLITER ':' -#define ENV_PATH "PATH" -#define NVCC_EXE "nvcc" -static int check_file_exist(const char* path, int mode) { - return access(path, mode); -} -#endif - -using namespace mgb; - -namespace { - std::unordered_map - set_priority_on_opr_inserted_handle; - std::mutex set_priority_on_opr_inserted_handle_mtx; - -} // anonymous namespace - -bool _config::set_comp_graph_option( - CompGraph &cg, const std::string &name, int val_int) { - -#define SET_CG_OPTION(name_chk) \ - do { \ - static_assert( \ - std::is_same::value || \ - std::is_same::value || \ - std::is_same::value || \ - std::is_same::value || \ - std::is_same::value, \ - "not bool/int opt"); \ - if (name == #name_chk) { \ - auto ret = opt.name_chk; \ - opt.name_chk = val_int; \ - return ret; \ - } \ - } while(0) - - auto &&opt = cg.get().options(); - SET_CG_OPTION(seq_opt.enable_mem_plan_opt); - SET_CG_OPTION(seq_opt.enable_mem_reuse_alloc); - SET_CG_OPTION(seq_opt.enable_seq_comp_node_opt); - SET_CG_OPTION(force_dynamic_alloc); - SET_CG_OPTION(enable_grad_var_static_reshape); - SET_CG_OPTION(async_exec_level); - SET_CG_OPTION(graph_opt.jit); - SET_CG_OPTION(graph_opt.tensorrt); - SET_CG_OPTION(graph_opt_level); - SET_CG_OPTION(allreduce_pack_max_size); - SET_CG_OPTION(allreduce_pack_ignore_first); - SET_CG_OPTION(var_sanity_check_first_run); - SET_CG_OPTION(no_profiling_on_shape_change); - SET_CG_OPTION(allocate_static_mem_after_graph_compile); - SET_CG_OPTION(log_level); - SET_CG_OPTION(enable_sublinear_memory_opt); - SET_CG_OPTION(sublinear_mem_config.lb_memory); - SET_CG_OPTION(sublinear_mem_config.genetic_nr_iter); - SET_CG_OPTION(sublinear_mem_config.genetic_pool_size); - SET_CG_OPTION(sublinear_mem_config.thresh_nr_try); - SET_CG_OPTION(sublinear_mem_config.num_worker); - SET_CG_OPTION(enable_var_mem_defragment); - SET_CG_OPTION(eager_evaluation); - SET_CG_OPTION(enable_memory_swap); - throw MegBrainError(ssprintf( - "invalid computing graph option name: %s", name.c_str())); -#undef SET_CG_OPTION -} - -bool _config::comp_graph_is_eager(CompGraph &cg) { - return cg.get().options().eager_evaluation; -} - -void _config::add_extra_vardep(const SymbolVar &var, const SymbolVar &dep) { - auto og = var.node()->owner_graph(); - mgb_assert(og == dep.node()->owner_graph()); - og->options().extra_vardeps[var.node()].push_back(dep.node()); -}
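A minimal usage sketch of the string-keyed option table above (illustrative only; `cg` is assumed to come from the SWIG wrapper layer and is not constructed here):

    // Illustrative only; option names mirror the SET_CG_OPTION table above.
    _config::set_comp_graph_option(cg, "graph_opt_level", 2);
    _config::set_comp_graph_option(cg, "enable_sublinear_memory_opt", 1);
    // Unknown option names throw MegBrainError.
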
void _config::begin_set_opr_priority(CompGraph& cg, int priority) { - SyncEventConnecter::ReceiverHandler* handle; - { - MGB_LOCK_GUARD(set_priority_on_opr_inserted_handle_mtx); - handle = &set_priority_on_opr_inserted_handle[&cg.get()]; - } - mgb_assert(!*handle, "multiple calls to _begin_set_opr_priority()"); - - auto on_opr_inserted = [priority](const cg::event::OprInserted& event) { - if (!event.exc && priority) { - int& pri = event.opr->node_prop().attribute().priority; - if (!pri) - pri = priority; - else - pri = std::min(pri, priority); - } - }; - *handle = cg.get().event().register_receiver( - on_opr_inserted); -} - -void _config::end_set_opr_priority(CompGraph &cg) { - MGB_LOCK_GUARD(set_priority_on_opr_inserted_handle_mtx); - auto nr = set_priority_on_opr_inserted_handle.erase(&cg.get()); - mgb_assert(nr, "end_set_opr_priority called " - "before begin_set_opr_priority"); -} - -void _config::begin_set_exc_opr_tracker(CompGraph &cg, PyObject *tracker) { - OprPyTracker::begin_set_tracker(cg.get(), tracker); -} - -void _config::end_set_exc_opr_tracker(CompGraph &cg) { - OprPyTracker::end_set_tracker(cg.get()); -} - -PyObject* _config::get_opr_tracker(CompGraph &cg, size_t var_id) { - auto var = cg.get().find_var_by_id(var_id); - if (!var) - Py_RETURN_NONE; - return OprPyTracker::get_tracker(var->owner_opr()).as_tuple(); -} - -void _config::set_opr_sublinear_memory_endpoint(const SymbolVar &var) { - MGB_MARK_USED_VAR(var); -#if MGB_ENABLE_SUBLINEAR - auto opr = var.node()->owner_opr(); - opr->owner_graph()->options().opr_attribute.sublinear_memory_endpoint. - insert(opr); -#endif -} - -void _config::set_fork_cuda_warning_flag(int flag) { -#if MGB_ENABLE_DEBUG_UTIL - debug::set_fork_cuda_warning_flag(flag); -#else - MGB_MARK_USED_VAR(flag); -#endif -} - -bool _config::is_cuda_ctx_set() { -#if MGB_CUDA - CUcontext ctx; - return cuCtxGetCurrent(&ctx) == CUDA_SUCCESS && ctx; -#else - return false; -#endif -} - -std::string _config::get_cuda_gencode() { -#if MGB_CUDA - std::set used; - int nr_dev; - auto err = cudaGetDeviceCount(&nr_dev); - if (err == cudaErrorNoDevice) { - return {}; - } - MGB_CUDA_CHECK(err); - for (int i = 0; i < nr_dev; ++ i) { - cudaDeviceProp prop; - MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); - std::string cur{std::to_string(prop.major)}; - cur += std::to_string(prop.minor); - used.insert(cur); - } - - std::string ret; - for (auto &&i: used) { - if (!ret.empty()) - ret.append(" "); - ret.append(i); - } - return ret; -#else - mgb_throw(MegBrainError, "cuda disabled at compile time"); -#endif -} - -namespace { - -std::string find_content_in_file(const std::string& file_name, - const std::string& content) { - std::ifstream fin(file_name.c_str()); - std::string read_str; - while (std::getline(fin, read_str)) { - auto idx = read_str.find(content); - if (idx != std::string::npos) { - fin.close(); - return read_str.substr(idx); - } - } - fin.close(); - return {}; -} - -std::vector split_env(const char* env) { - std::string e(env); - std::istringstream stream(e); - std::vector ret; - std::string path; - while (std::getline(stream, path, SPLITER)) { - ret.emplace_back(path); - } - return ret; -} - -//! this function will find file_name in each path in envs. It also accepts -//! intermediate paths between env and file_name -std::string find_file_in_envs_with_intmd( - const std::vector& envs, const std::string& file_name, - const std::vector& itmedias = {}) { - for (auto&& env : envs) { - auto ret = getenv(env.c_str()); - if (ret) { - for (auto&& path : split_env(ret)) { - auto file_path = std::string(path) + "/" + file_name; - if (!check_file_exist(file_path.c_str(), F_OK)) { - return file_path; - } - if (!itmedias.empty()) { - for (auto&& inter_path : itmedias) { - file_path = std::string(path) + "/" + inter_path + "/" + - file_name; - if (!check_file_exist(file_path.c_str(), F_OK)) { - return file_path; - } - } - } - } - } - } - return std::string{}; -} - -std::string get_nvcc_root_path() { - auto nvcc_root_path = find_file_in_envs_with_intmd({ENV_PATH}, NVCC_EXE); - if (nvcc_root_path.empty()) { - mgb_throw(MegBrainError, - "nvcc not found. Add your nvcc to your environment Path"); - } else { - auto idx = nvcc_root_path.rfind('/'); - return nvcc_root_path.substr(0, idx + 1); - } -} - -size_t get_local_cuda_version() { - auto nvcc_root_path = get_nvcc_root_path(); - auto ver_path = nvcc_root_path + "../version.txt"; - if (check_file_exist(ver_path.c_str(), F_OK)) { - mgb_throw(MegBrainError, "No such file: %s\n", ver_path.c_str()); - } - auto str_cuda_version = find_content_in_file(ver_path, "CUDA Version"); - if (str_cuda_version.empty()) { - mgb_throw(MegBrainError, "cannot read version information from: %s\n", - ver_path.c_str()); - } - size_t cuda_major = 0; - size_t cuda_minor = 0; - sscanf(str_cuda_version.c_str(), "CUDA Version %zu.%zu,", &cuda_major, - &cuda_minor); - return cuda_major * 1000 + cuda_minor * 10; -}
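The returned encoding mirrors cudaRuntimeGetVersion, i.e. major * 1000 + minor * 10. A worked example of decoding it (illustrative values only):

    // Worked example: decode the value produced by get_local_cuda_version().
    size_t encoded = 10010;             // parsed from "CUDA Version 10.1"
    size_t major = encoded / 1000;      // 10
    size_t minor = encoded % 1000 / 10; // 1
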
void check_cudnn_existence() { - auto cudnn_header_path = find_file_in_envs_with_intmd( - {"PC_CUDNN_INCLUDE_DIRS", "CUDNN_ROOT_DIR", "CUDA_TOOLKIT_INCLUDE", - "CUDNN_LIBRARY", "CUDA_PATH"}, - "cudnn.h", {"../include", "include"}); - if (cudnn_header_path.empty()) { - mgb_log_warn( - "cudnn.h not found. Please make sure cuDNN is installed at " - "${CUDNN_ROOT_DIR}"); - } else { // check cudnn lib exist - auto str_cudnn_major = - find_content_in_file(cudnn_header_path, "#define CUDNN_MAJOR"); - auto str_cudnn_minor = - find_content_in_file(cudnn_header_path, "#define CUDNN_MINOR"); - auto str_cudnn_patch = find_content_in_file(cudnn_header_path, - "#define CUDNN_PATCHLEVEL"); - - if (str_cudnn_major.empty() || str_cudnn_minor.empty() || - str_cudnn_patch.empty()) { - mgb_log_warn( - "cannot find cudnn version information in %s.\n You may " - "need to update cuDNN\n", - cudnn_header_path.c_str()); - return; - } - - size_t cudnn_major = 0, cudnn_minor = 0, cudnn_patch = 0; - sscanf(str_cudnn_major.c_str(), "#define CUDNN_MAJOR %zu", - &cudnn_major); - sscanf(str_cudnn_minor.c_str(), "#define CUDNN_MINOR %zu", - &cudnn_minor); - sscanf(str_cudnn_patch.c_str(), "#define CUDNN_PATCHLEVEL %zu", - &cudnn_patch); - -#ifdef WIN32 - std::string cudnn_lib_name = - "cudnn64_" + std::to_string(cudnn_major) + ".dll"; -#else - std::string cudnn_lib_name = - "libcudnn.so." + std::to_string(cudnn_major) + "." + - std::to_string(cudnn_minor) + "." + std::to_string(cudnn_patch); -#endif - - auto cudnn_lib_path = find_file_in_envs_with_intmd( - {"CUDNN_ROOT_DIR", "CUDNN_LIBRARY", "CUDA_PATH", ENV_PATH}, - cudnn_lib_name, {"lib64", "lib/x64"}); - if (cudnn_lib_path.empty()) { - mgb_log_warn( - "%s not found. Please make sure cuDNN is installed at " - "${CUDNN_LIBRARY}", - cudnn_lib_name.c_str()); - } - } -} -} // namespace - -std::vector _config::get_cuda_include_path() { -#if MGB_CUDA - auto nvcc_path = get_nvcc_root_path(); - auto cudart_header_path = nvcc_path + "../include/cuda_runtime.h"; - //! double check path_to_nvcc/../include/cuda_runtime.h exists - auto ret = check_file_exist(cudart_header_path.c_str(), F_OK); - if (ret) { - mgb_throw(MegBrainError, - "%s not found. Please make sure your CUDA toolkit is " - "installed correctly", - cudart_header_path.c_str()); - } else { - return {nvcc_path + "..", nvcc_path + "../include"}; - } -#else - mgb_throw(MegBrainError, "cuda disabled at compile time"); -#endif -} - -std::vector _config::get_cuda_lib_path() { -#if MGB_CUDA - auto nvcc_path = get_nvcc_root_path(); -#ifdef WIN32 - auto cuda_version = get_local_cuda_version(); - auto cuda_major = cuda_version / 1000; - auto cuda_minor = cuda_version % 1000 / 10; - auto cudart_lib_path = nvcc_path + "cudart64_" + - std::to_string(cuda_major * 10 + cuda_minor) + - ".dll"; -#else - auto cudart_lib_path = nvcc_path + "../lib64/libcudart.so"; -#endif - //! double check cudart_lib_path exists - auto ret = check_file_exist(cudart_lib_path.c_str(), F_OK); - if (ret) { - mgb_throw(MegBrainError, - "%s not found. Please make sure your CUDA toolkit is " - "installed correctly", - cudart_lib_path.c_str()); - } else { -#ifdef WIN32 - //! cudart64_101.dll locates at cuda/bin - return {nvcc_path + "../lib/x64", nvcc_path}; -#else - return {nvcc_path + "../lib64"}; -#endif - } -#else - mgb_throw(MegBrainError, "cuda disabled at compile time"); -#endif -} - -int _config::get_cuda_version() { -#if MGB_CUDA - int version; - MGB_CUDA_CHECK(cudaRuntimeGetVersion(&version)); - return version; -#else - mgb_throw(MegBrainError, "cuda disabled at compile time"); -#endif -} - -bool _config::is_local_cuda_env_ok() { - check_cudnn_existence(); - if (get_nvcc_root_path().empty()) { - return false; - } - return true; -} - -bool _config::is_compiled_with_cuda() { -#if MGB_CUDA - return true; -#else - return false; -#endif -} - -void _config::load_opr_library(const char* self_path, const char* lib_path) { - static bool self_global = false; - static std::mutex self_global_mtx; - { - MGB_LOCK_GUARD(self_global_mtx); - if (!self_global) { - auto hdl = dlopen(self_path, RTLD_LAZY | RTLD_GLOBAL); - mgb_assert(hdl, "failed to set mgb to global: %s", dlerror()); - self_global = true; - } - } - if (lib_path) { - auto hdl = dlopen(lib_path, RTLD_LAZY); - mgb_assert(hdl, "failed to load library %s: %s", lib_path, dlerror()); - } -} - -std::vector> _config::dump_registered_oprs() { -#if MGB_ENABLE_DEBUG_UTIL - return serialization::OprRegistry::dump_registries(); -#else - return {}; -#endif -}
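A hedged sketch of how these two entry points combine when loading a custom operator library (both paths are hypothetical placeholders, and the element type of the returned vector is left to `auto` since its template arguments are elided above):

    // Illustrative only; paths are placeholders, not real files.
    _config::load_opr_library("/path/to/_mgb.so", "/path/to/libcustom_opr.so");
    for (auto&& opr_record : _config::dump_registered_oprs()) {
        (void)opr_record; // one entry per registered operator
    }
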
#if MGB_ENABLE_OPR_MM -/*! see definition : src/cpp/megbrain_config.h. - * Create mm server. port 0 is permitted, leave zmqrpc to decide which port - * should be used. - */ -int _config::create_mm_server(const std::string& server_addr, int port) { - return create_zmqrpc_server(server_addr, port); -} - -void _config::group_barrier(const std::string& server_addr, - int port, uint32_t size, uint32_t rank) { - mgb_assert(rank < size, "invalid rank %d", rank); - auto group_mgr = std::make_shared( - ssprintf("%s:%d", server_addr.c_str(), port)); - uint32_t rsp = group_mgr->group_barrier(size, rank); - mgb_assert(rsp != 0, "rank already registered: %d", rank); - mgb_assert(size == rsp, "inconsistent size: %d, expect %d", size, rsp); -} - -#else - -int _config::create_mm_server(const std::string& server_addr, int port) { - mgb_throw(mgb::MegBrainError, "OPR_MM support disabled at compile time"); - return 0; -} - -void _config::group_barrier(const std::string& server_addr, - int port, uint32_t size, uint32_t rank) { - mgb_throw(mgb::MegBrainError, "OPR_MM support disabled at compile time"); -} - -#endif - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/megbrain_config.h b/python_module/src/cpp/megbrain_config.h deleted file mode 100644 index ed4f4c9d..00000000 --- a/python_module/src/cpp/megbrain_config.h +++ /dev/null @@ -1,78 +0,0 @@ -/** - * \file python_module/src/cpp/megbrain_config.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#ifndef SWIG - -#pragma once - -#include "megbrain_build_config.h" -#include "./megbrain_wrap.h" -#include -using mgb::cg::SymbolVar; -#endif - -//! wrapped in a class so swig can put the functions in a namespace -class _config { - public: - static bool set_comp_graph_option( - CompGraph &cg, const std::string &name, int val_int); - - static bool comp_graph_is_eager(CompGraph &cg); - - static void add_extra_vardep( - const SymbolVar &var, const SymbolVar &dep); - - static void begin_set_opr_priority( - CompGraph &cg, int priority); - static void end_set_opr_priority(CompGraph &cg); - - static void begin_set_exc_opr_tracker( - CompGraph &cg, PyObject *tracker); - static void end_set_exc_opr_tracker(CompGraph &cg); - - //! return (opr_msg, fwd tracker, grad tracker) or None - static PyObject* get_opr_tracker(CompGraph &cg, size_t var_id); - - static void set_opr_sublinear_memory_endpoint(const SymbolVar &var); - - static void set_fork_cuda_warning_flag(int flag); - - static bool is_cuda_ctx_set(); - - //! get cuda gencode strings for local devices - static std::string get_cuda_gencode(); - - //! get cuda lib paths. - static std::vector get_cuda_lib_path(); - - //! get cuda include paths. - static std::vector get_cuda_include_path(); - - //! get cuda version - static int get_cuda_version(); - - //! check the local cuda env. The method returns true if CUDA's nvcc - //! compiler and cudnn are installed in PATH. 
- static bool is_local_cuda_env_ok(); - - static bool is_compiled_with_cuda(); - - static void load_opr_library( - const char* self_path, const char* lib_path); - - static std::vector> - dump_registered_oprs(); - - static int create_mm_server(const std::string& server_addr, int port); - - static void group_barrier(const std::string& server_addr, - int port, uint32_t size, uint32_t rank); -}; - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/megbrain_pubapi.cpp b/python_module/src/cpp/megbrain_pubapi.cpp deleted file mode 100644 index 6665723c..00000000 --- a/python_module/src/cpp/megbrain_pubapi.cpp +++ /dev/null @@ -1,334 +0,0 @@ -/** - * \file python_module/src/cpp/megbrain_pubapi.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#include "./megbrain_pubapi.h" -#include "./megbrain_pubapi_internal.h" - -#include "megbrain/tensor.h" -#include "megbrain/graph/var_node.h" -#include "megbrain/comp_node_env.h" - -namespace { - -class DeleteDispatcher final : public mgb::CompNodeDepedentObject { - mgb::thin_function m_deleter; - mgb::CompNode m_comp_node; - std::atomic done; - - std::shared_ptr on_comp_node_finalize() override { - bool _ = false; - if (done.compare_exchange_strong(_, true)) { - m_deleter(); - } - return {}; - } -public: - explicit DeleteDispatcher(mgb::thin_function&& deleter, - mgb::CompNode cn) - : m_deleter(std::move(deleter)), m_comp_node(cn) { - done.store(false); - } - - void trigger() { - bool _ = false; - if (done.compare_exchange_strong(_, true)) { - if (!is_finalized()) { - m_comp_node.add_callback(std::move(m_deleter)); - } else { - m_deleter(); - } - } - } -}; - -} // namespace - -using namespace mgb; - -pubapi::DeviceTensor::DataType mgb::dtype_mgb2pubapi(DType dtype) { - using DevDType = pubapi::DeviceTensor::DataType; - switch (dtype.enumv()) { -#define o(s, t) \ - case DTypeEnum::s: \ - return DevDType::t - o(Float32, FLOAT32); - o(Float16, FLOAT16); - o(Int32, INT32); - o(Int16, INT16); - o(Int8, INT8); - o(Uint8, UINT8); -#undef o - default: - mgb_throw(MegBrainError, "dtype %s not implemented for pubapi", - dtype.name()); - } -} - -struct pubapi::DeviceTensor::_Impl { - -static TensorShape desc_shape_to_tensor_shape(const DeviceTensor::Desc &desc) { - TensorShape shape; - mgb_assert(desc.ndim && desc.ndim <= TensorShape::MAX_NDIM, - "invalid ndim: %zu", desc.ndim); - shape.ndim = desc.ndim; - for (size_t i = 0; i < desc.ndim; ++ i) { - shape[i] = desc.shape[i]; - } - return shape; -} - -#if MGB_CUDA -class CudaCurrentDeviceRestore { - int m_orig_dev = -1; - - public: - - CudaCurrentDeviceRestore(CompNode cn) { - if (cn.device_type() == CompNode::DeviceType::CUDA) { - MGB_CUDA_CHECK(cudaGetDevice(&m_orig_dev)); - } - } - - ~CudaCurrentDeviceRestore() { - if (m_orig_dev != -1) { - cudaSetDevice(m_orig_dev); - } - } -}; -#else -class CudaCurrentDeviceRestore { - public: - CudaCurrentDeviceRestore(CompNode) { - } -}; -#endif - -static void sync(const DeviceTensor *self, bool strong) { - CompNode cn; - if (self->m_dev_nd) { - cn = static_cast(self->m_dev_nd)->comp_node(); - } else { - mgb_assert(self->m_varptr); - cn = static_cast(self->m_varptr)->comp_node(); - } - CudaCurrentDeviceRestore cuda_dev_restore{cn}; - cn.sync(); -#if MGB_CUDA - if (strong && cn.device_type() == CompNode::DeviceType::CUDA) { - cn.activate(); - MGB_CUDA_CHECK(cudaDeviceSynchronize()); - } -#endif -} - 
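A consumer-side sketch of the public API surface implemented here (illustrative only; `raw` is assumed to hold the integer produced by a `_pubapi_dev_tensor_ptr(1)` call elsewhere in this module, and error handling is elided):

    // Illustrative only; `raw` comes from _pubapi_dev_tensor_ptr(1).
    auto* dt = mgb::pubapi::as_versioned_obj<mgb::pubapi::DeviceTensor>(raw);
    if (dt) {                 // nullptr if the version check failed
        dt->sync(true);       // strong sync: whole device for CUDA
        printf("%s tensor, %zu bytes\n", dt->dtype_name(), dt->size_bytes);
    }
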
-static const char* dtype_name(DataType dtype) { - switch (dtype) { -#define on(c) \ - case DataType::c: \ - return #c - on(FLOAT32); - on(FLOAT16); - on(INT32); - on(INT16); - on(INT8); - on(UINT8); -#undef on - default: - mgb_throw(MegBrainError, "invalid pubapi dtype enum: %d", - static_cast(dtype)); - } -} - -static void copy( - DeviceTensor *self, const Desc &other, CopyDirection direction) { - mgb_assert(self->desc.dtype == other.dtype, "dtype mismatch: %s vs %s", - self->dtype_name(), dtype_name(other.dtype)); - mgb_assert(self->m_varptr || self->m_dev_nd); - const DeviceTensorND *dv; - if (direction == CopyDirection::OTHER_TO_SELF) { - mgb_assert(!self->m_readonly, "can not copy into readonly tensor"); - auto shape = desc_shape_to_tensor_shape(other); - if (self->m_varptr) { - auto var = static_cast(self->m_varptr); - dv = &var->shape_alloc(shape).dev_tensor(); - } else { - dv = static_cast(self->m_dev_nd); - mgb_assert(dv->shape().eq_shape(shape), - "copy dest tensor shape is %s, but source shape is %s", - dv->shape().to_string().c_str(), shape.to_string().c_str()); - } - mgb_assert(self->desc.dtype == dtype_mgb2pubapi(dv->dtype())); - self->desc.dev_ptr = dv->raw_ptr(); - self->desc.ndim = dv->shape().ndim; - self->desc.shape = dv->shape().shape; - if (!other.dev_ptr) { - // used in resize() - return; - } - } else { - mgb_assert(direction == CopyDirection::SELF_TO_OTHER); - if (self->m_varptr) { - dv = &static_cast(self->m_varptr)->dev_tensor(); - } else { - dv = static_cast(self->m_dev_nd); - } - } - - mgb_assert(dv->layout().is_contiguous()); - auto size = dv->layout().span().dist_byte(); - auto cn = dv->comp_node(); - CudaCurrentDeviceRestore cuda_dev_restore{cn}; - - void *dst = dv->raw_ptr(), *src = other.dev_ptr; - if (direction == CopyDirection::SELF_TO_OTHER) { - std::swap(dst, src); - } - -#if !MGB_CUDA - mgb_assert(other.type != Type::CUDA, "cuda disabled at compile time"); -#endif - - auto &&desc = self->desc; - if (other.type == desc.type) { -#if MGB_CUDA - if (desc.type == Type::CUDA) { - int dev = desc.cuda_ctx.device; - if (dev == -1) { - MGB_CUDA_CHECK(cudaGetDevice(&dev)); - } - mgb_assert(dev == other.cuda_ctx.device, - "DeviceTensor copy must be on the same device; " - "got %d vs %d", dev, other.cuda_ctx.device); - } -#endif - cn.peer_copy_to(cn, dst, src, size); - } else { - if ((desc.type == Type::CPU && other.type == Type::CUDA && - direction == CopyDirection::SELF_TO_OTHER) || - (other.type == Type::CPU && desc.type == Type::CUDA && - direction == CopyDirection::OTHER_TO_SELF)) { - cn.copy_to_device(dst, src, size); - } else { - mgb_assert((desc.type == Type::CUDA && other.type == Type::CPU && - direction == CopyDirection::SELF_TO_OTHER) || - (other.type == Type::CUDA && desc.type == Type::CPU && - direction == CopyDirection::OTHER_TO_SELF)); - cn.copy_to_host(dst, src, size); - } - } -} - -static void forward_other_memory( - const DeviceTensor *self, - const Desc &other, CallbackOnce deleter) { - mgb_assert(self->desc.dtype == other.dtype, "dtype mismatch: %s vs %s", - self->dtype_name(), dtype_name(other.dtype)); - auto deleter_wrap = [deleter]() mutable { deleter.consume(); }; - thin_function deleter_dispatch; - if (self->desc.type == Type::CPU) { - CompNode cn{}; - if (self->m_varptr) { - cn = static_cast(self->m_varptr)->comp_node(); - } else { - cn = static_cast(self->m_dev_nd)->comp_node(); - } - deleter_dispatch = [d = new DeleteDispatcher(deleter_wrap, cn)](void*) { - d->trigger(); - delete d; - }; - } else { - deleter_dispatch = 
[deleter_wrap](void*) mutable { deleter_wrap(); }; - } - auto shape = desc_shape_to_tensor_shape(other); - if (self->m_varptr) { - auto var = static_cast(self->m_varptr); - DeviceTensorStorage storage; - storage.reset(var->comp_node(), - shape.total_nr_elems() * var->dtype().size(), - {static_cast(other.dev_ptr), deleter_dispatch}); - DeviceTensorND tensor; - tensor.reset(storage, {shape, var->dtype()}); - var->reset_dev_tensor_from_tensor(tensor); - } else { - DeviceTensorND& tensor = *static_cast(self->m_dev_nd); - DeviceTensorStorage storage; - size_t dtype_size = tensor.layout().dtype.size(); - storage.reset(tensor.comp_node(), - shape.total_nr_elems() * dtype_size, - {static_cast(other.dev_ptr), deleter_dispatch}); - tensor.reset(storage, {shape, tensor.layout().dtype}); - } -} - -static void forward_to( - const DeviceTensor *self, - void **dest, CallbackOnce* deleter) { - auto orig_dv_ptr = static_cast(self->m_dev_nd); - *dest = orig_dv_ptr->ptr(); - mgb_assert(*dest == self->desc.dev_ptr); - deleter->user_data = new DeviceTensorStorage(*orig_dv_ptr); - deleter->fptr = [](void* ptr) { - delete reinterpret_cast(ptr); - }; -} - -static void init_tensor(pubapi::DeviceTensor& dest, DeviceTensorND* tensor, - VarNode* var, bool readonly) { - memset(&dest, 0, sizeof(pubapi::DeviceTensor)); - { - static FuncTable functable{&sync, ©, &forward_other_memory, - &dtype_name, &forward_to}; - dest.m_functable = &functable; - } - dest._version0 = dest._version1 = CURRENT_VERSION; - - mgb_assert((!!tensor) ^ (!!var)); - auto cn = tensor ? tensor->comp_node() : var->comp_node(); - using Type = pubapi::DeviceTensor::Type; - switch (cn.device_type()) { - case CompNode::DeviceType::CPU: - dest.desc.type = Type::CPU; - break; -#if MGB_CUDA - case CompNode::DeviceType::CUDA: - dest.desc.type = Type::CUDA; - break; -#endif - default: - mgb_throw(MegBrainError, "bad comp node type: %d", - static_cast(cn.device_type())); - } - dest.desc.dtype = dtype_mgb2pubapi(tensor ? tensor->dtype() : var->dtype()); - if (tensor) { - dest.desc.dev_ptr = tensor->raw_ptr(); - dest.desc.shape = tensor->shape().shape; - dest.desc.ndim = tensor->shape().ndim; - dest.size_bytes = tensor->layout().span().dist_byte(); - } -#if MGB_CUDA - if (dest.desc.type == Type::CUDA) { - auto&& env = CompNodeEnv::from_comp_node(cn).cuda_env(); - dest.desc.cuda_ctx.device = env.device; - dest.desc.cuda_ctx.stream = env.stream; - } -#endif - dest.m_readonly = readonly; - dest.m_dev_nd = tensor; - dest.m_varptr = var; -} - -}; // pubapi::DeviceTensor::Impl - -void mgb::init_pubapi_dev_tensor(pubapi::DeviceTensor& dest, - DeviceTensorND* tensor, VarNode* var, - bool readonly) { - pubapi::DeviceTensor::_Impl::init_tensor(dest, tensor, var, readonly); -} - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/megbrain_pubapi.h b/python_module/src/cpp/megbrain_pubapi.h deleted file mode 100644 index fa33f815..00000000 --- a/python_module/src/cpp/megbrain_pubapi.h +++ /dev/null @@ -1,185 +0,0 @@ -/** - * \file python_module/src/cpp/megbrain_pubapi.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief public API for exposing megbrain internal data structures - * - * This is a pure header without compile-time dependencies. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - */ - -#pragma once - -#include -#include - -namespace mgb { -namespace pubapi { - - /*! 
- * \brief a general callback that would be invoked exactly once - * - * During the invocation, the functor should release the related memory - */ - struct CallbackOnce { - void (*fptr)(void *); - void *user_data; - - //! invoke the callback and clean up the scene - void consume() { - fptr(user_data); - fptr = nullptr; - user_data = nullptr; - } - }; - - //! tensor on a computing device - class DeviceTensor { - public: - static constexpr uint32_t CURRENT_VERSION = 20190725; - - //! device type - enum class Type: uint32_t { - CPU, CUDA - }; - enum class DataType: uint32_t { - FLOAT32, FLOAT16, INT32, INT16, INT8, UINT8 - }; - enum class CopyDirection { - SELF_TO_OTHER, OTHER_TO_SELF - }; - struct CudaContext { - int device; //!< set to -1 in copy() to use current device - void *stream; //!< set to nullptr for default stream - }; - - //! tensor descriptor - struct Desc { - Type type; - DataType dtype; - void *dev_ptr; //!< pointer to actual device buffer - const size_t *shape; //!< pointer to shape array - size_t ndim; - //! only valid if type == Type::CUDA - CudaContext cuda_ctx; - }; - - uint32_t _version0; //!< for consistency check - // note: fields starting with underscore are for internal use only - - Desc desc; - size_t size_bytes; - - /*! - * \brief synchronize with the calling thread - * - * This must be called before forwarding memory for direct use - * - * \param strong whether to synchronize the whole device (true), or - * just the computing node (false). Currently it only affects - * how cuda sync is performed. - */ - void sync(bool strong = false) const { - m_functable->sync(this, strong); - } - - /*! - * \brief copy to/from another buffer - * - * Note: the copy is performed on the comp node on which this tensor - * resides and is always async. - * - * If \p direction is OTHER_TO_SELF and shape of this changes, then - * the corresponding dev_ptr would also be updated. - * - * \param other the other buffer involved in the copy; if - * \p direction is SELF_TO_OTHER, then only its type and - * dev_ptr would be used - * \param direction specify the direction to perform the copy - */ - void copy(const Desc &other, CopyDirection direction) { - m_functable->copy(this, other, direction); - } - - /*! - * \brief resize this tensor to given shape - */ - void resize(size_t ndim, const size_t *shape) { - Desc tmp; - tmp.dev_ptr = nullptr; - tmp.ndim = ndim; - tmp.shape = shape; - copy(tmp, CopyDirection::OTHER_TO_SELF); - } - - //! name of dtype of this tensor - const char* dtype_name() const { return dtype_name(desc.dtype); } - - //! name of given dtype - const char* dtype_name(DataType dtype) const { - return m_functable->dtype_name(dtype); - } - - /*! - * \brief forward memory from \p other directly to the underlying - * storage - * - * This can only be used when there is a corresponding VarNode for - * this DeviceTensor. (e.g. for the outputs of Craniotome oprs) - */ - void forward_other_memory( - const Desc &other, CallbackOnce deleter) const { - m_functable->forward_other_memory(this, other, deleter); - } - - /*! - * \brief forward device buffer to \p dest directly and create a - * tensor storage sharing memory with m_dev_nd; it would be deleted - * when calling deleter, so the refcount of the data ptr can be managed - * correctly. 
- */ - void forward_to( - void **dest, CallbackOnce* deleter) const { - m_functable->forward_to(this, dest, deleter); - } - - struct _Impl; - private: - // note: we use a func table to avoid symbol visibility problems and - // linking hazards when built with other code base - struct FuncTable { - void (*sync)(const DeviceTensor*, bool); - void (*copy)(DeviceTensor*, const Desc&, CopyDirection); - void (*forward_other_memory)(const DeviceTensor*, const Desc&, - CallbackOnce); - const char* (*dtype_name)(DataType); - void (*forward_to)(const DeviceTensor*, void**, CallbackOnce*); - }; - bool m_readonly; - void* m_dev_nd; - void* m_varptr; - FuncTable* m_functable; - public: - uint32_t _version1; - }; - - /*! - * \brief reinterpret_cast raw pointer or pointer integer to mgb object and - * check version - * \return object pointer if the version is correct; nullptr if failed - */ - template - T* as_versioned_obj(S &&val) { - T *obj = reinterpret_cast(val); - if (obj->_version0 != T::CURRENT_VERSION || - obj->_version1 != T::CURRENT_VERSION) { - return nullptr; - } - return obj; - } -} // namespace pubapi -} // namespace mgb - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/megbrain_pubapi_internal.h b/python_module/src/cpp/megbrain_pubapi_internal.h deleted file mode 100644 index 7581d60e..00000000 --- a/python_module/src/cpp/megbrain_pubapi_internal.h +++ /dev/null @@ -1,31 +0,0 @@ -/** - * \file python_module/src/cpp/megbrain_pubapi_internal.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief internal helpers related to pubapi. Implemented in pubapi.cpp - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#pragma once - -#include "megbrain_pubapi.h" -#include "megbrain/graph.h" - -namespace mgb { - /*! - * \brief fill fields in \p dest with information from other tensors - * - * Note that exactly one of \p tensor and \p var must be non-null - */ - void init_pubapi_dev_tensor( - pubapi::DeviceTensor &dest, - DeviceTensorND *tensor, VarNode *var, bool readonly); - - //! convert megbrain dtype to pubapi dtype - pubapi::DeviceTensor::DataType dtype_mgb2pubapi(DType dtype); -} - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/megbrain_serialize.cpp b/python_module/src/cpp/megbrain_serialize.cpp deleted file mode 100644 index d6e371bb..00000000 --- a/python_module/src/cpp/megbrain_serialize.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/** - * \file python_module/src/cpp/megbrain_serialize.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
- * - */ - -#include "./megbrain_serialize.h" -#include "./python_helper.h" - -#include "megbrain/opr/basic_arith.h" - -using namespace mgb; -using namespace serialization; - -TensorValueDumperContext::~TensorValueDumperContext() noexcept = default; -TensorValueLoaderContext::~TensorValueLoaderContext() noexcept = default; - -PyObject* TensorValueDumperContext::_value() { - return npy::ndarray_from_tensor(m_value, npy::ShareType::TRY_SHARE); -} - -void TensorValueDumperContext::_write(PyObject *bytes) { - mgb_assert(PyBytes_Check(bytes)); - auto arr_len = PyBytes_Size(bytes); - auto arr_buf = PyBytes_AsString(bytes); - m_fout.write(arr_buf, arr_len); -} - -std::vector TensorValueLoaderContext::_get_shape() const { - mgb_assert(m_layout.is_contiguous()); - return npy::shape2vec(m_layout); -} - -PyObject* TensorValueLoaderContext::_get_dtype() const { - return npy::dtype_mgb2np(m_layout.dtype); -} - -PyObject* TensorValueLoaderContext::_read(size_t n) { - // Creates a PyBytes with uninitialized content - PyObject* bytes = PyBytes_FromStringAndSize(nullptr, n); - m_fin.read(PyBytes_AsString(bytes), n); - return bytes; -} - -std::string _get_info_for_strip(const SymbolVarArray &dest_vars) { - std::unordered_set opr_types, dtype_names, elemwise_modes; - - auto on_opr = [&](cg::OperatorNodeBase *opr) { - if (GraphDumper::should_remove_in_dump(opr)) - return; - opr_types.insert(opr->dyn_typeinfo()->name); - for (auto i: opr->output()) - dtype_names.insert(i->dtype().name()); - if (opr->same_type()) { - auto mode = opr->cast_final().param().mode; - elemwise_modes.insert( - megdnn::Elemwise::ModeTrait::from_mode(mode).name); - } - }; - cg::DepOprIter opr_iter{on_opr}; - for (auto i: dest_vars) - opr_iter.add(i.node()->owner_opr()); - - auto to_json = [](const std::unordered_set &v) { - std::vector vs(v.begin(), v.end()); - std::sort(vs.begin(), vs.end()); - auto ret = json::Array::make(); - for (auto &&i: vs) - ret->add(json::String::make(i)); - return ret; - }; - - return json::Object::make({ - {"opr_types", to_json(opr_types)}, - {"dtypes", to_json(dtype_names)}, - {"elemwise_modes", to_json(elemwise_modes)}, - })->to_string(); -} - -void _serialize_comp_graph_to_file( - const char *fpath, bool append, GraphDumpFormat format, - const SymbolVarArray &output_vars, - int keep_var_name, bool keep_param_name, bool keep_opr_priority, - _TensorValueDumperCallback *tensor_value_dumper, - std::vector &stat, - std::vector &inputs, - std::vector &outputs, - std::vector ¶ms) { - - auto dumper = GraphDumper::make( - OutputFile::make_fs(fpath, append ? 
'a' : 'w'), format); - GraphDumper::DumpConfig config{keep_var_name, keep_param_name, - keep_opr_priority}; - - if (tensor_value_dumper) { - config.tensor_value_dumper = [f=tensor_value_dumper]( - OutputFile &fout, const cg::OperatorNodeBase &opr, - const HostTensorND &value) { - mgb_assert(value.layout().is_contiguous()); - TensorValueDumperContext ctx{fout, opr, value}; - f->call(ctx); - }; - } - - auto rst = dumper->dump(output_vars, config); - inputs = std::move(rst.inputs); - outputs = std::move(rst.outputs); - params = std::move(rst.params); - stat = {rst.nr_opr, rst.tot_bytes, rst.tensor_value_bytes, - rst.content_hash}; -} - -CompGraph _load_comp_graph_from_file( - const char* fpath, _CompNodeMapperCallback* cn_mapper, - _TensorValueLoaderCallback* tensor_value_loader, - std::vector>& output_var_map, - SymbolVarArray& output_var_list) { - auto file = InputFile::make_fs(fpath); - auto format = GraphLoader::identify_graph_dump_format(*file); - mgb_throw_if(!format.valid(), SerializationError, - "unknown model format (input is likely not a MegBrain model)"); - auto loader = GraphLoader::make(std::move(file), format.val()); - GraphLoader::LoadConfig config; - if (cn_mapper) { - config.comp_node_mapper = [f = cn_mapper](CompNode::Locator& locator) { - locator = CompNode::Locator::parse(f->call(locator.to_string())); - }; - } - if (tensor_value_loader) { - config.tensor_value_loader = [f = tensor_value_loader]( - void* ptr, - const TensorLayout& layout, - InputFile& fin) { - TensorValueLoaderContext ctx{layout, fin}; - PyObjRefKeeper value = f->call(ctx); - mgb_assert(value.get()->ob_refcnt > 0); - if (ptr) { - HostTensorStorage storage; - // Unmanaged shared_ptr - storage.reset(CompNode::default_cpu(), - layout.span().dist_byte(), - {std::shared_ptr(), - reinterpret_cast(ptr)}); - HostTensorND tensor; - tensor.reset(storage, layout); - npy::np2tensor(value.get(), npy::Meth::copy_into(&tensor), - layout.dtype); - } - }; - } - auto rst = loader->load(config); - output_var_map = {rst.output_var_map.begin(), rst.output_var_map.end()}; - output_var_list = std::move(rst.output_var_list); - - std::unordered_map tensor2name; - for (const auto& pair : rst.tensor_map) { - tensor2name[pair.second.get()] = &pair.first; - } - auto cb = [&tensor2name, graph=rst.graph](cg::OperatorNodeBase* opr) { - if (!opr->same_type()) - return; - - auto& h2d = opr->cast_final_safe(); - auto it = tensor2name.find(h2d.host_data().get()); - mgb_throw_if(it == tensor2name.end(), GraphError, - "unbound Host2DeviceCopy in loaded graph"); - h2d.output(0)->name(*it->second); - mark_as_input(graph.get(), h2d.output(0)); - }; - cg::DepOprIter iter{cb}; - for (const auto& var : output_var_list) { - iter.add(var.node()->owner_opr()); - } - return CompGraph::make_from_shared_ptr(rst.graph); -} - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/megbrain_serialize.h b/python_module/src/cpp/megbrain_serialize.h deleted file mode 100644 index f320c680..00000000 --- a/python_module/src/cpp/megbrain_serialize.h +++ /dev/null @@ -1,159 +0,0 @@ -/** - * \file python_module/src/cpp/megbrain_serialize.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
- * - */ - - -#ifndef SWIG - -#pragma once - -#include "megbrain/serialization/serializer.h" -#include "./megbrain_wrap.h" -using mgb::cg::SymbolVar; -using mgb::cg::SymbolVarArray; -#endif - -#ifdef SWIG -%feature("autodoc", -"An object that is passed to the callback in \ -:func:`.serialize_comp_graph_to_file`.") TensorValueDumperContext; -%feature("autodoc", -"An object that is passed to the callback in \ -:func:`.load_comp_graph_from_file`.") TensorValueLoaderContext; -%feature("director") _TensorValueDumperCallback; -%feature("director") _TensorValueLoaderCallback; -%feature("director") _CompNodeMapperCallback; - -%template(_VectorPairStringSymbolVar) std::vector>; -%typemap(directorout) PyObjRefKeeper { - Py_XINCREF($input); - $result = PyObjRefKeeper($input); -} -#endif -class TensorValueDumperContext { -#ifndef SWIG - mgb::serialization::OutputFile &m_fout; - const mgb::cg::OperatorNodeBase &m_opr; - const mgb::HostTensorND &m_value; -#endif - - public: - TensorValueDumperContext() = delete; - TensorValueDumperContext(const TensorValueDumperContext&) = delete; - TensorValueDumperContext& operator = ( - const TensorValueDumperContext&) = delete; - -#ifndef SWIG - TensorValueDumperContext( - mgb::serialization::OutputFile &fout, - const mgb::cg::OperatorNodeBase &opr, - const mgb::HostTensorND &value): - m_fout{fout}, m_opr{opr}, m_value{value} - { - } -#endif - ~TensorValueDumperContext() noexcept; - - const char* _name() const { - return m_opr.cname(); - } - - const char* _type() const { - return m_opr.dyn_typeinfo()->name; - } - - PyObject* _value(); - - void _write(PyObject *bytes); - - void _write_default() { - mgb::serialization::GraphDumpConfig::default_tensor_value_dumper( - m_fout, m_opr, m_value); - } - -#ifdef SWIG -%include "./megbrain_serialize_TensorValueDumperContext.py" -#endif - -}; - -class TensorValueLoaderContext { -#ifndef SWIG - const mgb::TensorLayout &m_layout; - mgb::serialization::InputFile &m_fin; -#endif - - public: - TensorValueLoaderContext() = delete; - TensorValueLoaderContext(const TensorValueLoaderContext&) = delete; - TensorValueLoaderContext& operator=(const TensorValueLoaderContext&) = - delete; - -#ifndef SWIG - TensorValueLoaderContext(const mgb::TensorLayout &layout, - mgb::serialization::InputFile &fin) - : m_layout(layout), m_fin(fin) {} -#endif - ~TensorValueLoaderContext() noexcept; - - std::vector _get_shape() const; - PyObject* _get_dtype() const; - - // Returns bytes - PyObject* _read(size_t n); - -#ifdef SWIG -%include "./megbrain_serialize_TensorValueLoaderContext.py" -#endif -}; - -class _TensorValueDumperCallback { - public: - virtual ~_TensorValueDumperCallback() = default; - virtual void call(TensorValueDumperContext &ctx) = 0; -}; - -class _TensorValueLoaderCallback { - public: - virtual ~_TensorValueLoaderCallback() = default; - virtual PyObjRefKeeper call(TensorValueLoaderContext &ctx) = 0; -}; - -class _CompNodeMapperCallback { - public: - virtual ~_CompNodeMapperCallback() = default; - virtual std::string call(const std::string &desc) = 0; -}; - -#ifdef SWIG -%include "megbrain/serialization/dump_format.h" -#else -#include "megbrain/serialization/dump_format.h" -#endif - -void _serialize_comp_graph_to_file( - const char *fpath, bool append, - mgb::serialization::GraphDumpFormat format, - const SymbolVarArray &output_vars, - int keep_var_name, bool keep_param_name, bool keep_opr_priority, - _TensorValueDumperCallback *tensor_value_dumper, - std::vector &stat, - std::vector &inputs, - std::vector &outputs, - std::vector ¶ms); - 
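A sketch of a host-side callback implementation (illustrative; the name `RemapToCpu` and the target locator are hypothetical). An instance of it would be passed as `cn_mapper` to `_load_comp_graph_from_file` declared below, remapping every computing node to CPU at load time:

    // Illustrative subclass; not part of the original header.
    class RemapToCpu final : public _CompNodeMapperCallback {
    public:
        std::string call(const std::string& desc) override {
            (void)desc;    // original locator string, e.g. "gpu0"
            return "cpu0"; // hypothetical target locator
        }
    };
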
-std::string _get_info_for_strip(const SymbolVarArray &dest_vars); - -CompGraph _load_comp_graph_from_file( - const char *fpath, _CompNodeMapperCallback *cn_mapper, - _TensorValueLoaderCallback *tensor_value_loader, - /* Outputs */ - std::vector> &output_var_map, - SymbolVarArray &output_var_list); - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/megbrain_serialize_TensorValueDumperContext.py b/python_module/src/cpp/megbrain_serialize_TensorValueDumperContext.py deleted file mode 100644 index d8028678..00000000 --- a/python_module/src/cpp/megbrain_serialize_TensorValueDumperContext.py +++ /dev/null @@ -1,44 +0,0 @@ -%pythoncode { - @property - def name(self): - """name of the param - - :type: str - """ - return str(self._name()) - - @property - def type(self): - """operator type - - :type: str - """ - return str(self._type()) - - @property - def value(self): - """numerical value of the param - - :type: :class:`numpy.ndarray` - """ - return self._value() - - def write(self, buf): - """write raw data to the output file - - :param buf: value to be written - :type buf: :class:`bytes` - :return: self - """ - assert type(buf) is bytes, 'bad value: {!r}'.format(type(buf)) - self._write(buf) - - def write_default(self): - """dump the numerical value in default format - - :return: self - """ - self._write_default() - return self - -} diff --git a/python_module/src/cpp/megbrain_serialize_TensorValueLoaderContext.py b/python_module/src/cpp/megbrain_serialize_TensorValueLoaderContext.py deleted file mode 100644 index a2ff798c..00000000 --- a/python_module/src/cpp/megbrain_serialize_TensorValueLoaderContext.py +++ /dev/null @@ -1,19 +0,0 @@ -%pythoncode { - @property - def shape(self): - return self._get_shape() - - @property - def dtype(self): - return self._get_dtype() - - def read(self, size): - """read raw data from the input file - - :param size: number of bytes to be read - :type size: :class:`int` - :return: bytes - """ - return self._read(size) - -} diff --git a/python_module/src/cpp/megbrain_wrap.cpp b/python_module/src/cpp/megbrain_wrap.cpp deleted file mode 100644 index bcb85ae1..00000000 --- a/python_module/src/cpp/megbrain_wrap.cpp +++ /dev/null @@ -1,1055 +0,0 @@ -/** - * \file python_module/src/cpp/megbrain_wrap.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#include "./megbrain_wrap.h" -#include "./python_helper.h" -#include "./megbrain_pubapi_internal.h" - -#include "megbrain/version.h" -#include "megbrain/tensor.h" -#include "megbrain/comp_node_env.h" -#include "megbrain/opr/io.h" -#include "megbrain/opr/utility.h" -#include "megbrain/gopt/inference.h" -#include "megbrain/utils/thread.h" -#include "megbrain/utils/timer.h" - -#include -using namespace mgb; - -namespace { - bool g_global_finalize_called = false; - - /*! - * \brief record the vars produced from user-created Host2DeviceCopy - * - * Note that the vars are mapped by address of underlying HostTensorND, so - * in the case of partial execution, vars in the parent graph can be - * retrieved from oprs in the sub graphs. - */ - class UserInputVars final : public UserDataContainer::UserData { - MGB_TYPEINFO_OBJ_DECL; - - //! we keep this mapping to handle multi-part compiling, where new - //! 
graphs would be created and the var in the original graph is needed - ThinHashMap m_tensor2var; - - public: - void register_var(SymbolVar x) { - m_tensor2var[x.node()->owner_opr() - ->cast_final_safe() - .host_data() - .get()] = x.node(); - } - - //! get the corresponding var from an opr if it has been registered; - //! return nullptr otherwise - VarNode* check(cg::OperatorNodeBase* opr) const { - if (opr->same_type()) { - auto ptr = opr->cast_final() - .host_data() - .get(); - auto iter = m_tensor2var.find(ptr); - return iter == m_tensor2var.end() ? nullptr : iter->second; - } - return nullptr; - } - - static UserInputVars& get(ComputingGraph* graph) { - return *graph->options() - .user_data.get_user_data_or_create(); - } - }; - - __attribute__((constructor)) - void global_init() { - CompNode::enable_affinity_for_cpu(true); - } -} // anonymous namespace - -MGB_TYPEINFO_OBJ_IMPL(UserInputVars); - -/* ================= SharedND ================= */ - -bool SharedND::sync(mgb::DeviceTensorND &dv) { - if (m_copy_sync) { - dv.sync(); - return true; - } - return false; -} - -void SharedND::_set_init_shape(const std::vector &shape) { - mgb_assert(m_dev_tensor && m_dev_tensor->empty()); - m_dev_tensor->resize(npy::vec2shape(shape)); -} - -void SharedND::_resize(const std::vector &shape) { - auto tshp = npy::vec2shape(shape); - if (m_dev_tensor) { - m_dev_tensor->resize(tshp); - } else { - mgb_assert(m_var); - m_var->shape_alloc(tshp); - } -} - -void SharedND::_reset_zero() { - fill_zero_dev_tensor(*m_dev_tensor); -} - -void SharedND::_copy_from_npyarr(PyObject *npyarr) { - auto do_copy = [&](DeviceTensorND *dest, VarNode *var) { - DType dtype = dest ? dest->dtype() : var->dtype(); - mgb_assert(dtype.valid()); - auto hv = npy::np2tensor(npyarr, npy::Meth::borrow(), dtype); - if (var) { - // only setup by assign(), by craniotome - var->shape_alloc(hv.shape()); - dest = &var->mutable_dev_tensor(); - } - if (!sync(dest->copy_from(hv))) { - m_async_copy_refkeeper = hv; - } else { - m_async_copy_refkeeper = {}; - } - }; - if (m_var) { - mgb_assert(!m_dev_tensor); - do_copy(nullptr, m_var); - } else { - mgb_assert(m_dev_tensor); - do_copy(m_dev_tensor.get(), nullptr); - } -} - -PyObject* SharedND::_get_npyarr() { - mgb_assert(m_dev_tensor); - if (m_dev_tensor->empty()) - Py_RETURN_NONE; - HostTensorND hv; - hv.comp_node(CompNode::default_cpu()) - .copy_from(*m_dev_tensor) - .sync(); - return npy::ndarray_from_tensor(hv, npy::ShareType::TRY_SHARE); -} - -PyObject* SharedND::_get_dtype() { - mgb_assert(m_dev_tensor); - return npy::dtype_mgb2np(m_dev_tensor->dtype()); -} - -void SharedND::_copy_from_value_proxy(CompGraphCallbackValueProxy &value) { - if (value.eager_copy()) { - mgb_log_warn("copy from eager-copied CompGraphCallbackValueProxy into" - " SharedND; consider using callback_lazycopy; traceback:\n%s", - PyStackExtracter::run().c_str()); - } - - if (m_var) { - mgb_assert(!m_dev_tensor); - auto &&src = value.dev_tensor(); - m_var->shape_alloc(src.shape()). 
- mutable_dev_tensor().copy_from(src); - } else { - mgb_assert(m_dev_tensor); - sync(m_dev_tensor->copy_from(value.dev_tensor())); - } -} - -void SharedND::_share_from_value_proxy(CompGraphCallbackValueProxy& value) { - if (value.eager_copy()) { - mgb_log_warn( - "share value from eager-copied CompGraphCallbackValueProxy into" - " SharedND; consider using callback_lazycopy; traceback:\n%s", - PyStackExtracter::run().c_str()); - } - - if (m_var) { - mgb_assert(!m_dev_tensor); - m_var->reset_dev_tensor_from_tensor(value.dev_tensor()); - } else { - mgb_assert(m_dev_tensor); - *m_dev_tensor = value.dev_tensor(); - } -} - -SharedND SharedND::_from_symvar(SymbolVar symvar) { - auto opr = symvar.node()->owner_opr(); - if (auto vsnd = opr->try_cast_final()) { - return SharedND(vsnd->dev_data()); - } - if (auto snd = opr->try_cast_final()) { - return SharedND(snd->dev_data()); - } - mgb_throw(MegBrainError, "cannot convert from %s", opr->dyn_typeinfo()->name); -} - -uintptr_t SharedND::_pubapi_dev_tensor_ptr(int version) { - DeviceTensorND *dv; - if (m_dev_tensor) { - mgb_assert(!m_var); - dv = m_dev_tensor.get(); - } else { - mgb_assert(m_var); - dv = nullptr; - } - void *ret; - if (version == 0) { - if (dv) { - ret = dv->raw_ptr(); - } else { - ret = m_var->dev_tensor().raw_ptr(); - } - } else { - init_pubapi_dev_tensor(m_pubapi_dev_tensor, dv, m_var, false); - ret = &m_pubapi_dev_tensor; - } - return reinterpret_cast(ret); -} - -SymbolVar SharedND::_as_sym_var(CompGraph &cg, const std::string &name, - bool volatile_) { - mgb_assert(m_dev_tensor); - OperatorNodeConfig config; - if (!name.empty()) - config.name(name); - if (volatile_) { - return opr::VolatileSharedDeviceTensor::make(cg.get(), m_dev_tensor, - config); - } else { - return opr::SharedDeviceTensor::make(cg.get(), m_dev_tensor, config); - } -} - -std::vector SharedND::_get_shape(){ - if (m_var) { - mgb_assert(!m_dev_tensor); - return npy::shape2vec(m_var->shape()); - } - mgb_assert(m_dev_tensor); - return npy::shape2vec(m_dev_tensor->shape()); -} - -void SharedND::copy_to_sub_from_shared( - int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step, - const SharedND &rhs) { - mgb_assert(m_dev_tensor && rhs.m_dev_tensor); - auto sub = m_dev_tensor->sub( - Slice(begin, end, step).apply(m_dev_tensor->layout(), axis)); - sub.copy_from_fixlayout(*rhs.m_dev_tensor).sync(); - -} - -void SharedND::copy_from_shared_sub(const SharedND &rhs, - int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step) { - mgb_assert(m_dev_tensor && rhs.m_dev_tensor); - if (axis == -3) { - sync(m_dev_tensor->copy_from_fixlayout(*rhs.m_dev_tensor)); - } else if (axis == -2) { - sync(m_dev_tensor->copy_from(*rhs.m_dev_tensor)); - } else { - auto sub = rhs.m_dev_tensor->sub( - Slice(begin, end, step).apply( - rhs.m_dev_tensor->layout(), axis)); - sync(m_dev_tensor->copy_from(sub)); - } -} - -void SharedND::_check_before_share_memory(const SharedND& rhs) { - mgb_assert(rhs.m_dev_tensor); - mgb_assert(m_dev_tensor); - mgb_assert(rhs.m_dev_tensor->dtype() == m_dev_tensor->dtype()); - mgb_assert(rhs.m_dev_tensor->comp_node() == m_dev_tensor->comp_node()); -} - -void SharedND::_share_memory_from(const SharedND& rhs, size_t begin) { - _check_before_share_memory(rhs); - m_dev_tensor->reset( - rhs.m_dev_tensor->storage().sub(m_dev_tensor->dtype().size() * begin), - m_dev_tensor->layout()); -} - -void SharedND::_reset_dev_tensor(const SharedND &rhs) { - _check_before_share_memory(rhs); - *m_dev_tensor = *(rhs.m_dev_tensor); -} - -/* ================= _HostSharedND 
-
-/* ================= _HostSharedND ================= */
-
-void _HostSharedND::ensure_own_storage() {
-    if (!m_own_storage) {
-        mgb_assert(m_tensor);
-        HostTensorND val{m_tensor->comp_node(), m_tensor->dtype()};
-        if (!m_tensor->empty()) {
-            val.resize(m_tensor->shape());
-        }
-        *m_tensor = std::move(val);
-        m_own_storage = true;
-    }
-}
-
-void _HostSharedND::_resize(const std::vector<size_t> &shape) {
-    ensure_own_storage();
-    m_tensor->resize(npy::vec2shape(shape));
-}
-
-void _HostSharedND::_copy_from_npyarr(PyObject *npyarr, bool borrow) {
-    mgb_assert(m_tensor);
-    mgb_assert(m_tensor->dtype().valid());
-    if (!m_borrow_on_cpu &&
-            m_tensor->comp_node().device_type() == CompNode::DeviceType::CPU) {
-        borrow = false;
-    }
-    if (borrow) {
-        auto val = npy::np2tensor(
-                npyarr, npy::Meth::borrow(m_tensor->comp_node()),
-                m_tensor->dtype());
-        m_own_storage = false;
-        *m_tensor = std::move(val);
-    } else {
-        ensure_own_storage();
-        npy::np2tensor(npyarr,
-                npy::Meth::copy_into(m_tensor.get()), m_tensor->dtype());
-    }
-}
-
-SymbolVar _HostSharedND::_as_sym_var(CompGraph &cg, bool enable_static_infer,
-        const std::string &name) {
-    if (m_tensor->empty())
-        cg.get().options().allocate_static_mem_after_graph_compile = false;
-
-    OperatorNodeConfig config;
-    if (!name.empty())
-        config.name(name);
-
-    SymbolVar ret;
-    if (enable_static_infer) {
-        ret = opr::Host2DeviceCopy::make(cg.get(), m_tensor, config);
-    } else {
-        ret = opr::Host2DeviceCopy::make_no_value_infer(cg.get(), m_tensor,
-                config);
-    }
-    UserInputVars::get(&cg.get()).register_var(ret);
-    return ret;
-}
-
-_HostSharedND _HostSharedND::make_proxy(SymbolVar var) {
-    auto &&opr = var.node()->owner_opr()->
-        cast_final_safe<opr::Host2DeviceCopy>();
-    _HostSharedND rst{var.node()->comp_node(), var.dtype()};
-    rst.m_tensor = opr.host_data();
-    rst.m_proxied_opr = &opr;
-    return rst;
-}
-
-std::string _HostSharedND::__repr__() const {
-    if (m_proxied_opr) {
-        return ssprintf("<_HostSharedND@%p proxy for %s>",
-                this, m_proxied_opr->cname());
-    }
-    return ssprintf("<_HostSharedND@%p>", this);
-}
-
-PyObject* _HostSharedND::_get_dtype() {
-    mgb_assert(m_tensor);
-    return npy::dtype_mgb2np(m_tensor->dtype());
-}
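-// annotation: make_proxy() above shares the HostTensorND owned by a
-// Host2DeviceCopy opr, so writing through the proxy directly feeds the next
-// execution of the graph; ensure_own_storage() is what breaks such sharing
-// before a mutating _resize() or a non-borrowing copy.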
-
-/* ================= CompGraphCallbackValueProxy ================= */
-
-CompGraphCallbackValueProxy
-CompGraphCallbackValueProxy::make_raw_host_value_proxy(
-        const mgb::HostTensorND &hv) {
-    CompGraphCallbackValueProxy ret;
-    ret.m_use_raw_hv = true;
-    ret.m_hv = hv;
-    ret.m_is_active = true;
-    return ret;
-}
-
-void CompGraphCallbackValueProxy::setup(
-        const mgb::DeviceTensorND &val, bool eager_copy) {
-
-    while (__atomic_load_n(&m_is_active, __ATOMIC_SEQ_CST)) {
-        // wait for previous callback to finish
-        std::this_thread::yield();
-    }
-
-    mgb_assert(!m_use_raw_hv && val.shape_valid());
-    m_eager_copy = eager_copy;
-    m_dev_value = val;
-    if (eager_copy) {
-        m_value_used = false;
-        do_copy();
-    } else {
-        m_value_used = true;
-    }
-
-    __atomic_store_n(&m_is_active, true, __ATOMIC_SEQ_CST);
-}
-
-void CompGraphCallbackValueProxy::do_copy() {
-    mgb_assert(!m_use_raw_hv && m_dev_value.shape_valid());
-    m_hv.copy_from(m_dev_value);
-    auto cn = m_hv.comp_node();
-    if (!m_copy_event)
-        m_copy_event = cn.create_event();
-    m_copy_event->record();
-}
-
-#if defined(WIN32)
-#include <windows.h>
-#include
-#undef CONST
-#define usleep Sleep
-#endif
-void CompGraphCallbackValueProxy::sync() const {
-    mgb_assert(!m_use_raw_hv);
-    RealTimer t0;
-    double next_warn_time = 2, warn_time_delta = 1;
-    while (!m_copy_event->finished()) {
-        //! sleep 1ms or sleep 1us no difference for performance on win32
-        usleep(1);
-        if (t0.get_secs() >= next_warn_time) {
-            mgb_log_warn("wait d2h copy for more than %.3f secs",
-                    t0.get_secs());
-            next_warn_time += warn_time_delta;
-            warn_time_delta += 1;
-        }
-    }
-}
-
-void CompGraphCallbackValueProxy::on_finished() {
-    mgb_assert(m_is_active && !m_use_raw_hv);
-    m_dev_value = {};
-    if (m_hv.shape_valid()) {
-        m_hv.resize({});    // resize to reuse buffer
-    }
-    __atomic_store_n(&m_is_active, false, __ATOMIC_SEQ_CST);
-    if (!m_value_used) {
-        mgb_log_warn("computing graph callback did not read the value");
-    }
-}
-
-PyObject* CompGraphCallbackValueProxy::_get_npyarr() {
-    mgb_assert(m_is_active);
-
-    if (!m_use_raw_hv) {
-        mgb_assert(m_dev_value.shape_valid());
-        if (!m_hv.shape_valid()) {
-            do_copy();
-            sync();
-        }
-    }
-    m_value_used = true;
-    return npy::ndarray_from_tensor(m_hv, npy::ShareType::TRY_SHARE);
-}
-
-PyObject* CompGraphCallbackValueProxy::_get_dtype() {
-    mgb_assert(m_is_active);
-
-    if (m_use_raw_hv)
-        return npy::dtype_mgb2np(m_hv.dtype());
-
-    mgb_assert(m_dev_value.shape_valid());
-    return npy::dtype_mgb2np(m_dev_value.dtype());
-}
-
-std::vector<size_t> CompGraphCallbackValueProxy::_get_shape() {
-    mgb_assert(m_is_active);
-
-    if (m_use_raw_hv)
-        return npy::shape2vec(m_hv.shape());
-
-    mgb_assert(m_dev_value.shape_valid());
-    return npy::shape2vec(m_dev_value.shape());
-}
-
-uintptr_t CompGraphCallbackValueProxy::_pubapi_dev_tensor_ptr(int version) {
-    mgb_assert(m_is_active && !m_use_raw_hv);
-    mgb_assert(m_dev_value.shape_valid());
-    void *ret;
-    if (version == 0) {
-        ret = m_dev_value.raw_ptr();
-    } else {
-        init_pubapi_dev_tensor(
-                m_pubapi_dev_tensor, &m_dev_value, nullptr, true);
-        ret = &m_pubapi_dev_tensor;
-    }
-    return reinterpret_cast<uintptr_t>(ret);
-}
-
-mgb::CompNode CompGraphCallbackValueProxy::_get_comp_node() {
-    mgb_assert(m_is_active && !m_use_raw_hv);
-    mgb_assert(m_dev_value.shape_valid());
-    return m_dev_value.comp_node();
-}
-
-/* ================= AsyncExec ================= */
-
-class AsyncExec::Core {
-    public:
-        Core(std::unique_ptr<mgb::cg::AsyncExecutable> f):
-            m_func(std::move(f))
-        {
-        }
-
-        mgb::cg::AsyncExecutable* func() const {
-            return m_func.get();
-        }
-
-        struct CallbackParam {
-            std::vector<CompGraphCallbackValueProxy> value;
-            _CompGraphCallback *cb;
-        };
-
-        void dispatch_callback(const CallbackParam &param) {
-            m_worker.add_task(param);
-        }
-
-        void wait_callback_finish() {
-            m_worker.wait_all_task_finish();
-        }
-
-    private:
-        std::unique_ptr<mgb::cg::AsyncExecutable> m_func;
-
-        class Worker final: public AsyncQueueSC<CallbackParam, Worker> {
-            public:
-                void process_one_task(const CallbackParam &task) {
-                    for (auto &tmp_value: task.value) {
-                        tmp_value.sync();
-                    }
-                    task.cb->call_pycb();
-                }
-        };
-        Worker m_worker;
-};
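-// annotation: Core::Worker runs on its own thread -- for each CallbackParam
-// it first sync()s every eager-copied value (blocking on the recorded copy
-// event) and only then re-enters Python via call_pycb(), so the graph
-// execution thread itself never waits for the Python callback.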
-
-AsyncExec::AsyncExec(std::unique_ptr<mgb::cg::AsyncExecutable> f):
-    m_core(std::make_shared<Core>(std::move(f)))
-{
-}
-
-AsyncExec::~AsyncExec() {
-    if (m_core)
-        _wait();
-}
-
-AsyncExec::Core* AsyncExec::core() const {
-    return m_core.get();
-}
-
-void AsyncExec::_execute() {
-    m_core->func()->execute();
-}
-
-std::string AsyncExec::_to_json_str() {
-    auto jv = m_core->func()->to_json();
-    return jv->to_string();
-}
-
-void AsyncExec::_wait() {
-    m_core->wait_callback_finish();
-    m_core->func()->wait();
-}
-
-double AsyncExec::_get_prev_exec_time() {
-    return m_core->func()->get_prev_exec_time();
-}
-
-SymbolVarArray AsyncExec::_find_mutable_input() {
-    ThinHashSet<VarNode*> used_set;
-    UserInputVars* user_vars = nullptr;
-    auto cb = [&](cg::OperatorNodeBase* opr) {
-        if (!user_vars) {
-            ComputingGraph* g;
-            if (m_multi_part_par_graph)
-                g = m_multi_part_par_graph.get();
-            else
-                g = opr->owner_graph();
-            user_vars = &UserInputVars::get(g);
-        }
-        if (auto var = user_vars->check(opr)) {
-            used_set.insert(var);
-        }
-        return true;
-    };
-    m_core->func()->iter_opr_seq(cb);
-    for (auto i : m_core->func()->get_rt_static_source_deps()) {
-        cb(i.dest->owner_opr());
-    }
-    SymbolVarArray ret;
-    ret.reserve(used_set.size());
-    ret.insert(ret.begin(), used_set.begin(), used_set.end());
-    return ret;
-}
-
-void AsyncExec::clear_device_memory() {
-    _wait();
-    m_core->func()->clear_device_memory();
-}
-
-std::vector<std::pair<mgb::CompNode, size_t>>
-AsyncExec::_update_static_alloc_plan_and_get_size() {
-    std::vector<std::pair<mgb::CompNode, size_t>> ret;
-    for (auto&& i : m_core->func()->update_static_alloc_plan_and_get_size()) {
-        ret.emplace_back(i.first, i.second);
-    }
-    return ret;
-}
-
-/* ================= _CompGraphCallback ================= */
-
-void _CompGraphCallback::set_async_exec(const AsyncExec &ae) {
-    mgb_assert(!m_ae_core);
-    m_ae_core = ae.core();
-}
-
-void _CompGraphCallback::set_eager_copy(bool flag) {
-    mgb_assert(!m_cb_created);
-    m_eager_copy = flag;
-}
-
-std::function<void(mgb::SmallVector<mgb::DeviceTensorND>&)>
-_CompGraphCallback::make_multi_input_callback() {
-    mgb_assert(!m_cb_created);
-    m_cb_created = true;
-
-    // shared_ptr would delete this afterwards
-    std::shared_ptr<_CompGraphCallback> self(this);
-
-    auto cb = [self](SmallVector<DeviceTensorND> &data) {
-        for (size_t i = self->m_value_proxies.size(); i < data.size(); ++i) {
-            self->m_value_proxies.emplace_back();
-        }
-        if (self->m_eager_copy) {
-            mgb_assert(self->m_ae_core);
-            for (size_t i = 0; i < self->m_value_proxies.size(); ++i) {
-                self->m_value_proxies[i].setup(data[i], true);
-            }
-            self->m_ae_core->dispatch_callback(
-                    AsyncExec::Core::CallbackParam{self->m_value_proxies, self.get()}
-            );
-        } else {
-            for (size_t i = 0; i < self->m_value_proxies.size(); ++i)
-                self->m_value_proxies[i].setup(data[i], false);
-            self->call_pycb();
-        }
-    };
-
-    return cb;
-}
-
-std::function<void(mgb::DeviceTensorND&)> _CompGraphCallback::make_callback() {
-    this->m_value_proxies.emplace_back();
-    mgb_assert(!m_cb_created);
-    m_cb_created = true;
-
-    // shared_ptr would delete this afterwards
-    std::shared_ptr<_CompGraphCallback> self(this);
-
-    auto cb = [self](mgb::DeviceTensorND &data) {
-        if (self->m_eager_copy) {
-            mgb_assert(self->m_ae_core);
-            self->m_value_proxies[0].setup(data, true);
-            self->m_ae_core->dispatch_callback(
-                    AsyncExec::Core::CallbackParam{self->m_value_proxies, self.get()}
-            );
-        } else {
-            self->m_value_proxies[0].setup(data, false);
-            self->call_pycb();
-        }
-    };
-
-    return cb;
-}
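-// annotation: both factories above wrap `this` in
-// std::shared_ptr<_CompGraphCallback> and capture the pointer by value in the
-// returned closure, so the callback object lives exactly as long as the graph
-// holds the std::function and is deleted together with it.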
-
-void _CompGraphCallback::call_pycb() {
-    try {
-        call(m_value_proxies);
-    } catch (...) {
-        for (auto &m_value_proxy: m_value_proxies) {
-            m_value_proxy.on_finished();
-        }
-        throw;
-    }
-    for (auto &m_value_proxy: m_value_proxies) {
-        m_value_proxy.on_finished();
-    }
-}
-
-/* ================= CompGraph ================= */
-
-class CompGraph::PyUserData final: public UserDataContainer::UserData,
-                                   public NonCopyableObj {
-    MGB_TYPEINFO_OBJ_DECL;
-
-    PyObject *m_obj;
-
-    public:
-        PyUserData() {
-            PYTHON_GIL;
-            m_obj = PyDict_New();
-            mgb_assert(m_obj, "failed to create python object");
-        }
-
-        ~PyUserData() {
-            PYTHON_GIL;
-            Py_DECREF(m_obj);
-        }
-
-        PyObject* get() const {
-            return m_obj;
-        }
-};
-MGB_TYPEINFO_OBJ_IMPL(CompGraph::PyUserData);
-
-mgb::ComputingGraph& CompGraph::get() const {
-    if (m_comp_graph_own)
-        return *m_comp_graph_own;
-    auto &&val = m_comp_graph_borrow.lock();
-    mgb_assert(val, "CompGraph has been destructed");
-    return *val;
-}
-
-void CompGraph::clear_device_memory() {
-    if (!m_comp_graph_own)
-        return;
-    m_comp_graph_own->clear_device_memory();
-}
-
-PyObject* CompGraph::_user_data() {
-    auto ct = get().options().user_data.get_user_data_or_create<PyUserData>();
-    auto ret = ct->get();
-    PYTHON_GIL;
-    Py_INCREF(ret);
-    return ret;
-}
-
-void CompGraph::_add_output_spec(
-        mgb::cg::SymbolVar &var, _CompGraphCallback *callback) {
-
-    cg::ComputingGraph::Callback cb;
-    if (callback) {
-        cb = callback->make_callback();
-        m_raw_callbacks.push_back({callback, m_out_specs.size() - 1});
-    }
-    if (m_out_specs.empty()) {
-        m_out_specs.emplace_back();
-    }
-    m_out_specs.back().push_back({var, cb});
-}
-
-AsyncExec CompGraph::_do_compile(bool copy, bool optimize_for_inference) {
-    mgb_assert(m_out_specs.size() == 1, "got %zu output specs for compile",
-               m_out_specs.size());
-    auto&& spec = m_out_specs[0];
-    if (optimize_for_inference) {
-        SymbolVarArray vars;
-        vars.reserve(spec.size());
-        for (auto&& i : spec) {
-            vars.push_back(i.first);
-        }
-        vars = gopt::optimize_for_inference(vars, {});
-        mgb_assert(vars.size() == spec.size());
-        for (size_t i = 0; i < vars.size(); ++i) {
-            spec[i].first = vars[i];
-        }
-    }
-
-    std::unique_ptr<cg::AsyncExecutable> async_executable;
-    if (get().options().eager_evaluation ||
-            (copy && get().current_comp_seq())) {
-        // need to copy a new comp graph
-        SymbolVarArray vars;
-        vars.reserve(spec.size());
-        for (auto&& i : spec) {
-            vars.emplace_back(i.first);
-        }
-
-        // copy graph
-        auto new_graph = mgb::ComputingGraph::make();
-        SymbolVarArray new_vars =
-                replace_vars_comp_graph(std::move(vars), new_graph.get());
-        mgb_assert(new_vars.size() == spec.size());
-
-        // register input
-        auto h2d = find_h2d(new_vars);
-        for (auto&& i : h2d) {
-            UserInputVars::get(new_graph.get()).register_var(i);
-        }
-
-        mgb::ComputingGraph::OutputSpec new_spec;
-        new_spec.reserve(spec.size());
-        for (size_t i = 0; i < spec.size(); ++i) {
-            new_spec.emplace_back(mgb::ComputingGraph::OutputSpecItem{
-                    new_vars[i], spec[i].second});
-        }
-        async_executable = new_graph->compile(new_spec);
-    } else {
-        async_executable = get().compile(spec);
-    }
-
-    AsyncExec ret{std::move(async_executable)};
-
-    for (auto&& i : m_raw_callbacks) {
-        mgb_assert(!i.second);
-        i.first->set_async_exec(ret);
-    }
-    _clear_output_spec();
-    return ret;
-}
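-// annotation: when eager evaluation is enabled, or `copy` is requested while
-// the graph already holds a compiled sequence, _do_compile() clones the
-// endpoint vars into a fresh ComputingGraph via replace_vars_comp_graph() and
-// re-registers its Host2DeviceCopy inputs, so the previously compiled
-// function stays usable alongside the new one.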
-
-std::vector<AsyncExec> CompGraph::_do_compile_multi_part() {
-    // last spec is empty due to an extra call to _add_multi_part_endpoint()
-    mgb_assert(m_out_specs.size() > 1 && m_out_specs.back().empty(),
-               "got %zu output specs for multi-part compile",
-               m_out_specs.size());
-    m_out_specs.pop_back();
-    std::vector<AsyncExec> ret;
-    ret.reserve(m_out_specs.size());
-    auto graph = get().shared_from_this();
-    for (auto&& i : graph->compile_multi_part(m_out_specs)) {
-        ret.emplace_back(std::move(i));
-    }
-    for (auto&& i : ret) {
-        i.set_multi_part_par_graph(graph);
-    }
-    for (auto&& i : m_raw_callbacks) {
-        i.first->set_async_exec(ret.at(i.second));
-    }
-    _clear_output_spec();
-    return ret;
-}
-
-/* ================= SharedScalar ================= */
-
-SharedScalar::SharedScalar(PyObject *val):
-    m_val{std::make_shared<mgb::DTypeScalar>()}
-{
-    _set(val);
-}
-
-HostTensorND& SharedScalar::val_as_host_nd() {
-    if (m_val_as_host_nd.empty()) {
-        HostTensorStorage storage;
-        storage.reset(CompNode::default_cpu(), m_val->dtype().size(),
-                {m_val, static_cast<dt_byte*>(
-                    const_cast<void*>(m_val->storage()))});
-        m_val_as_host_nd.reset(storage, {TensorShape{1}, m_val->dtype()});
-    }
-    return m_val_as_host_nd;
-}
-
-void SharedScalar::_set(PyObject *val) {
-    auto tensor = npy::np2tensor(val, npy::Meth::borrow(), {});
-    mgb_assert(tensor.layout().is_scalar(),
-            "value given to SharedScalar must be scalar; got shape %s",
-            tensor.shape().to_string().c_str());
-    if (m_dtype_locked) {
-        mgb_assert(tensor.dtype() == m_val->dtype(),
-                "dtype for SharedScalar has been locked as %s, "
-                "but attempt to set it to %s", m_val->dtype().name(),
-                tensor.dtype().name());
-    }
-    m_val->set_raw(tensor.dtype(), tensor.raw_ptr());
-
-    if (!m_dev_val.empty()) {
-        auto &&hv = val_as_host_nd();
-        for (auto &&i: m_dev_val)
-            i.second->copy_from_fixlayout(hv);
-    }
-}
-
-PyObject* SharedScalar::_get() {
-    HostTensorND hv{CompNode::default_cpu(), TensorShape{1}, m_val->dtype()};
-    memcpy(hv.raw_ptr(), m_val->storage(), m_val->dtype().size(1));
-    return npy::ndarray_from_tensor(hv, npy::ShareType::TRY_SHARE);
-}
-
-SymbolVar SharedScalar::_as_sym_var(CompGraph &cg, mgb::CompNode &cn) {
-    m_dtype_locked = true;
-    auto &&dv = m_dev_val[cn];
-    auto &&hv = val_as_host_nd();
-    if (!dv) {
-        dv = std::make_shared<DeviceTensorND>(cn);
-        dv->copy_from(hv);
-    }
-    return opr::SharedDeviceTensor::make(cg.get(), dv,
-            ssprintf("SharedScalar@%p", m_val.get()));
-}
-
-/* =============== Operator =============== */
-
-const std::unique_ptr<mgb::OprFootprint> Operator::sm_opr_footprint_ptr{
-        std::make_unique<mgb::OprFootprint>()};
-
-/* ================= misc ================= */
-
-SymbolVar fill_retain_dtype(SymbolVar var, PyObject *value) {
-    auto tensor = npy::np2tensor(value, npy::Meth::borrow(), {});
-    mgb_assert(tensor.shape().is_scalar(),
-            "value for fill_retain_dtype must be scalar; got shape %s",
-            tensor.shape().to_string().c_str());
-    switch (tensor.dtype().enumv()) {
-#define cb(_dt) case DTypeTrait<_dt>::enumv: \
-        static_assert(sizeof(DTypeTrait<_dt>::ctype) <= sizeof(int), \
-                "bad dtype size"); \
-        return var.fill_retain_dtype(static_cast<int>( \
-                    *tensor.ptr<DTypeTrait<_dt>::ctype>()));
-        MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb)
-#undef cb
-        case DTypeEnum::Float32:
-            return var.fill_retain_dtype(*tensor.ptr<dt_float32>());
-        case DTypeEnum::Float16:
-            return var.fill_retain_dtype(
-                    static_cast<float>(*tensor.ptr<dt_float16>()));
-        case DTypeEnum::BFloat16:
-            return var.fill_retain_dtype(
-                    static_cast<float>(*tensor.ptr<dt_bfloat16>()));
-        // TODO: What does this mean?
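-        // annotation (apparently deliberate): the quantized, Byte, Bool and
-        // low-bit cases below only break out of the switch and fall through
-        // to the ConversionError at the end, since a plain scalar fill is
-        // not meaningful for these dtypes.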
-        case DTypeEnum::Quantized8Asymm:
-        case DTypeEnum::QuantizedS32:
-        case DTypeEnum::QuantizedS8:
-        case DTypeEnum::Quantized4Asymm:
-        case DTypeEnum::QuantizedS4:
-        case DTypeEnum::Byte:
-        case DTypeEnum::QuantizedS16:
-        case DTypeEnum::Bool:
-            break;
-#define cb(low_bit, size) \
-        case DTypeEnum::low_bit##size: \
-            break;
-        MEGDNN_FOREACH_LOWBIT_DTYPE(cb)
-#undef cb
-    }
-    throw ConversionError(ssprintf(
-            "unsupported value dtype: %s", tensor.dtype().name()));
-}
-
-PyObject* get_symvar_inferred_value(mgb::SymbolVar symvar) {
-    auto var = symvar.node();
-    auto&& mgr = var->owner_graph()->static_infer_manager();
-    using IT = cg::static_infer::InferType;
-    auto it = mgr.get_infer_type(var);
-    if (!(it.value & (IT::CONST | IT::RT_STATIC)))
-        Py_RETURN_NONE;
-
-    auto val = mgr.infer_value_fallible(var);
-    if (!val)
-        Py_RETURN_NONE;
-
-    auto hv = HostTensorND::make_proxy(*val);
-    return npy::ndarray_from_tensor(hv, npy::ShareType::MUST_UNSHARE);
-}
-
-void _mgb_global_finalize() {
-    CompNode::finalize();
-    g_global_finalize_called = true;
-}
-
-bool global_finalized() {
-    return g_global_finalize_called;
-}
-
-std::vector<int> _get_mgb_version() {
-    return {MGB_MAJOR, MGB_MINOR, MGB_PATCH, MGB_IS_DEV};
-}
-
-SymbolVarArray _grad(SymbolVar target, SymbolVarArray wrts,
-        bool warn_mid_wrt, int use_virtual_grad,
-        bool return_zero_for_nodep) {
-    if (use_virtual_grad == -1) {
-        use_virtual_grad = std::abs(
-                target.node()->owner_graph()->options().graph_opt_level) >= 2;
-    }
-
-    if (use_virtual_grad) {
-        mgb_assert(return_zero_for_nodep,
-                "can't return a null var when using virtual grad opr");
-        SymbolVarArray ret;
-        ret.reserve(wrts.size());
-        for (auto&& wrt : wrts) {
-            ret.push_back(opr::VirtualGrad::make(target, wrt));
-        }
-        return ret;
-    }
-    return cg::grad(target, wrts, warn_mid_wrt, return_zero_for_nodep);
-}
-
-SymbolVar _inter_graph_trans_var(
-        CompGraph &dest_graph, SymbolVar src) {
-    auto &&graph = dest_graph.get();
-    auto trans = mgb::cg::InterGraphVarTransformer::get(graph);
-    mgb_assert(trans, "trans func on graph %p has not been setup", &graph);
-    return trans->trans(src.node());
-}
-
-SymbolVar _get_graph_optimizer_replaced_var(SymbolVar src) {
-    return gopt::GraphOptimizer::var_replace_lookup(src.node());
-}
-
-void mark_as_input(ComputingGraph* cg, SymbolVar var) {
-    VarNode* node = var.node();
-    mgb_assert(node->owner_graph() == cg);
-    mgb_assert(node->owner_opr()->same_type<opr::Host2DeviceCopy>());
-    UserInputVars::get(cg).register_var(var);
-}
-
-namespace {
-
-void add_update_impl(const DeviceTensorND& dest,
-        const DeviceTensorND& delta_nobrd,
-        float alpha, float beta, float bias) {
-    auto&& cn = dest.comp_node();
-    using DT = CompNode::DeviceType;
-    mgb_assert(cn == delta_nobrd.comp_node() &&
-               (cn.device_type() == DT::CUDA || cn.device_type() == DT::CPU ||
-                cn.device_type() == DT::ROCM));
-    mgb_assert(dest.dtype() == delta_nobrd.dtype());
-    auto&& delta = delta_nobrd.sub(SubTensorSpec::make_from_offset_elem(
-            delta_nobrd.layout().broadcast(dest.shape()), 0));
-    cn.activate();
-    if (!static_cast<bool>(alpha) && beta == 1 &&
-            !static_cast<bool>(bias)) {
-        dest.copy_from_fixlayout(delta);
-    } else {
-        auto&& handle = MegDNNHandle::get(
-                CompNodeEnv::from_comp_node(cn)).handle();
-        auto&& op = handle->create_operator<megdnn::AddUpdate>();
-        op->param() = {alpha, beta, bias};
-        op->exec(dest.as_megdnn(), delta.as_megdnn());
-        if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) {
-            CompNodeEnv::from_comp_node(cn).cpu_env().dispatch(
-                    [p = op.release()] { delete p; }
-            );
-        }
-    }
-}
-
-} // anonymous namespace
-
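-// a sketch of the intended semantics (assuming the megdnn::AddUpdate
-// parameter convention): add_update_impl() computes, in place and with delta
-// broadcast to dest's shape,
-//     dest = dest * alpha + delta * beta + bias
-// and the alpha == 0 && beta == 1 && bias == 0 case above degenerates to a
-// plain copy_from_fixlayout().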
-void _add_update_fastpath(SharedND& dest_, SharedND& delta_,
-        float alpha, float beta, float bias) {
-    auto&& dest = dest_.dev_tensor();
-    auto&& delta = delta_.dev_tensor();
-    add_update_impl(*dest, *delta, alpha, beta, bias);
-}
-
-void _add_update_fastpath(SharedND& dest_, CompGraphCallbackValueProxy& delta_,
-        float alpha, float beta, float bias) {
-    auto&& dest = dest_.dev_tensor();
-    auto&& delta = delta_.dev_tensor();
-    add_update_impl(*dest, delta, alpha, beta, bias);
-}
-
-// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/python_module/src/cpp/megbrain_wrap.h b/python_module/src/cpp/megbrain_wrap.h
deleted file mode 100644
index 106457ee..00000000
--- a/python_module/src/cpp/megbrain_wrap.h
+++ /dev/null
@@ -1,491 +0,0 @@
-/**
- * \file python_module/src/cpp/megbrain_wrap.h
- *
- * This file is part of MegBrain, a deep learning framework developed by Megvii.
- *
- * \brief wrappers for basic functionalities
- *
- * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
- *
- */
-
-#pragma once
-
-#include "./python_helper.h"
-#include "./megbrain_pubapi.h"
-
-#include "megbrain/graph.h"
-#include "megbrain/opr/io.h"
-
-#include "megbrain/plugin/opr_footprint.h"
-
-#include
-#include
-
-class CompGraph;
-class CompGraphCallbackValueProxy;
-
-/*!
- * \brief proxy a mgb::DeviceTensorND or a SymbolVar
- */
-class SharedND {
-    mgb::pubapi::DeviceTensor m_pubapi_dev_tensor;
-
-    std::shared_ptr<mgb::DeviceTensorND> m_dev_tensor;
-    mgb::HostTensorND m_async_copy_refkeeper;
-    mgb::VarNode *m_var = nullptr;
-    bool m_copy_sync = true;
-
-    bool sync(mgb::DeviceTensorND &dv);
-    inline void _check_before_share_memory(const SharedND& rhs);
-
-    public:
-        SharedND() = default;
-
-        SharedND(mgb::CompNode node, PyObject* dtype):
-            m_dev_tensor(std::make_shared<mgb::DeviceTensorND>(
-                        node, npy::dtype_np2mgb(dtype)))
-        { }
-
-        SharedND(const std::shared_ptr<mgb::DeviceTensorND>& dv)
-            : m_dev_tensor(dv) {}
-
-        //! set init shape; can be only called once
-        void _set_init_shape(const std::vector<size_t> &shape);
-
-        //! resize to given shape
-        void _resize(const std::vector<size_t> &shape);
-
-        //! reset dev_tensor to zeros
-        void _reset_zero();
-
-        /*!
-         * \brief assign to proxy given dev tensor; used by craniotome
-         */
-        void assign(const mgb::DeviceTensorND &dv) {
-            mgb_assert(!m_dev_tensor && !m_var);
-            m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(dv);
-        }
-
-        /*!
-         * \brief assign to proxy a var node; used by craniotome
-         */
-        void assign(mgb::VarNode *var) {
-            mgb_assert(!m_dev_tensor && !m_var);
-            m_var = var;
-        }
-
-        /*!
-         * \brief share memory from another SharedND; only used in ParamPack
-         */
-        void _share_memory_from(const SharedND& rhs, size_t begin);
-
-        /*!
-         * \brief reset dev_tensor to another SharedNd's
-         */
-        void _reset_dev_tensor(const SharedND& rhs);
-
-        uintptr_t _pubapi_dev_tensor_ptr(int version);
-
-        mgb::SymbolVar _as_sym_var(CompGraph &cg, const std::string &name,
-                bool volatile_);
-
-        mgb::CompNode _get_comp_node() {
-            return m_dev_tensor->comp_node();
-        }
-
-        void _set_copy_sync(bool flag) {
-            m_copy_sync = flag;
-        }
-
-        //! get dev buffer from shared nd
-        const std::shared_ptr<mgb::DeviceTensorND>& dev_tensor() {
-            return m_dev_tensor;
-        }
-
-        void _copy_from_npyarr(PyObject *npyarr);
-        void _copy_from_value_proxy(CompGraphCallbackValueProxy &value);
-        void _share_from_value_proxy(CompGraphCallbackValueProxy &value);
-        static SharedND _from_symvar(mgb::SymbolVar symvar);
-
-        //! get numpy ndarray that contains a copy of the value; return new ref
-        PyObject* _get_npyarr();
-        PyObject* _get_dtype();
-        std::vector<size_t> _get_shape();
-
-        /*!
- * \brief copy to sub of this from another SharedND - * \param axis axis for sub, or -1 to work on flattened array - */ - void copy_to_sub_from_shared( - int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step, - const SharedND &rhs); - - /*! - * \brief copy from sub of another SharedND to this - * \param axis axis for sub, or -1 to work on flattened array, -2 to - * copy whole tensor, -3 to copy whole tensor fixlayout - */ - void copy_from_shared_sub(const SharedND &rhs, - int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step); -}; - -/*! - * \brief wraps around shared pointer to mgb::HostTensorND - */ -class _HostSharedND { - bool m_own_storage = false, m_borrow_on_cpu = false; - std::shared_ptr m_tensor; - //! set to non-null if this _HostSharedND is set to proxy a var - mgb::opr::Host2DeviceCopy* m_proxied_opr = nullptr; - - void ensure_own_storage(); - - public: - _HostSharedND() = default; - - _HostSharedND(const _HostSharedND &rhs): - m_own_storage{false}, - m_tensor{rhs.m_tensor}, - m_proxied_opr{rhs.m_proxied_opr} - { - } - - _HostSharedND(mgb::CompNode node, mgb::DType dtype): - m_own_storage{true}, - m_tensor{std::make_shared(node, dtype)} - { - } - - _HostSharedND(mgb::CompNode node, PyObject* dtype): - _HostSharedND(node, npy::dtype_np2mgb(dtype)) - { - } - - _HostSharedND& operator = (const _HostSharedND &) = delete; - - /*! - * \brief make a _HostSharedND by proxing a var produced by - * Host2DeviceCopy - */ - static _HostSharedND make_proxy(mgb::SymbolVar var); - - mgb::SymbolVar _as_sym_var(CompGraph &cg, bool enable_static_infer, - const std::string &name); - - void _resize(const std::vector &shape); - void _copy_from_npyarr(PyObject *npyarr, bool borrow); - - void _enable_borrow_on_cpu(bool flag) { - m_borrow_on_cpu = flag; - } - - std::string __repr__() const; - PyObject* _get_dtype(); -}; - -/*! - * \brief proxy a value to be passed to computing graph callback - */ -class CompGraphCallbackValueProxy { - mgb::pubapi::DeviceTensor m_pubapi_dev_tensor; - bool m_is_active = false; //! setup called but on_finished not called - bool m_use_raw_hv = false; - bool m_value_used, m_eager_copy; - mgb::HostTensorND m_hv; - std::shared_ptr m_copy_event; - - //! original dev value - mgb::DeviceTensorND m_dev_value; - - //! perform D2H copy - void do_copy(); - - public: - static CompGraphCallbackValueProxy make_raw_host_value_proxy( - const mgb::HostTensorND &hv); - - bool eager_copy() const { - return m_eager_copy; - } - - mgb::DeviceTensorND& dev_tensor() { - return m_dev_value; - } - - void setup(const mgb::DeviceTensorND &val, bool eager_copy); - void sync() const; - - /*! - * \brief called after python callback returned - */ - void on_finished(); - - //! get numpy ndarray that contains a copy of the value; return new ref - PyObject* _get_npyarr(); - PyObject* _get_dtype(); - std::vector _get_shape(); - - uintptr_t _pubapi_dev_tensor_ptr(int version); - - mgb::CompNode _get_comp_node(); -}; - -class AsyncExec { - public: - class Core; - - AsyncExec() = default; - - ~AsyncExec(); - - AsyncExec(std::unique_ptr f); - - void _execute(); - void _wait(); - double _get_prev_exec_time(); - - void clear_device_memory(); - - std::vector> - _update_static_alloc_plan_and_get_size(); - - std::string _to_json_str(); - - /*! - * \brief find all Host2DeviceCopy input vars that are mutable (i.e. 
- * used as func args) - */ - mgb::SymbolVarArray _find_mutable_input(); - - Core* core() const; - - void set_multi_part_par_graph(std::shared_ptr g) { - m_multi_part_par_graph = std::move(g); - } - - private: - std::shared_ptr m_core; - //! parent graph in multi-part compiling - std::shared_ptr m_multi_part_par_graph; -}; - -/*! - * \brief callback wrapper for computing graph - */ -class _CompGraphCallback { - bool m_cb_created = false, m_eager_copy = false; - AsyncExec::Core* m_ae_core = nullptr; - std::vector m_value_proxies; - - public: - /*! - * \brief set AsyncExec associated with this callback; if it is set, - * eager value copy would be enabled - */ - void set_async_exec(const AsyncExec &ae); - - /*! - * \brief set whether enabling eager copy - * - * If eager copy is enabled, host to device copy would start immediately - * and asynchronously when this callback is executed by megbrain - */ - void set_eager_copy(bool flag); - - virtual ~_CompGraphCallback() = default; - - std::function &)> make_multi_input_callback(); - std::function make_callback(); - - /*! - * \brief call python callback - */ - void call_pycb(); - - /*! - * \brief python callback to be overwritten - */ - virtual void call(std::vector&) = 0; -}; - -/*! - * \brief wrap around shared mgb::ComputingGraph - */ -class CompGraph { - class PyUserData; - - mgb::SmallVector m_out_specs; - //! (callback, output spec part) - mgb::SmallVector> m_raw_callbacks; - - std::shared_ptr m_comp_graph_own; - std::weak_ptr m_comp_graph_borrow; - - explicit CompGraph(const std::shared_ptr& cg) - : m_comp_graph_own{cg} {} - - explicit CompGraph(const std::weak_ptr &cg): - m_comp_graph_borrow{cg} - {} - - public: - - CompGraph(): - m_comp_graph_own(mgb::ComputingGraph::make()) - {} - - // A mgb::cg::ComputingGraph may be wrapped in a CompGraph in two ways: - // 1. Borrowing a ComputingGraph. - // 2. Own a shared_ptr of ComputingGraph. - // We make constructors private and use factory function instead to make - // it explicit at the call site. (So-called "Named Constructor") - - /*! - * \brief Wrap a ComputingGraph by borrowing a reference. - */ - static CompGraph make_from_weak_ptr( - const std::weak_ptr& cg) { - return CompGraph{cg}; - } - - /*! - * \brief Wrap a ComputingGraph by owning one of its reference. - */ - static CompGraph make_from_shared_ptr( - const std::shared_ptr& cg) { - return CompGraph{cg}; - } - - CompGraph(const mgb::cg::SymbolVarArray& dest_symbol_vars) { - m_comp_graph_own = mgb::ComputingGraph::make(); - mgb::cg::replace_vars_comp_graph(dest_symbol_vars, - m_comp_graph_own.get()); - } - - void clear_device_memory(); - - //! get underlying ComputingGraph instance - mgb::ComputingGraph& get() const; - - CompGraph& share_device_memory_with(CompGraph &other) { - get().share_device_memory_with(other.get()); - return *this; - } - - //! get a dict to store arbitrary user data - PyObject* _user_data(); - - AsyncExec _do_compile(bool copy, bool optimize_for_inference); - std::vector _do_compile_multi_part(); - - /*! - * \brief add an output spec - * \param callback callback to be invoked; or nullptr for computing - * output var only - */ - void _add_output_spec(mgb::cg::SymbolVar &var, - _CompGraphCallback *callback); - - //! 
mark currently added output specs as a part in multi-part compile - void _add_multi_part_endpoint() { - m_out_specs.emplace_back(); - } - - void _clear_output_spec() { - m_raw_callbacks.clear(); - m_out_specs.resize(1); - m_out_specs[0].clear(); - } - - size_t _release() { - if (m_comp_graph_own) { - auto ret = m_comp_graph_own.use_count(); - m_comp_graph_own.reset(); - return ret; - } - m_comp_graph_borrow.reset(); - return 0; - } - -}; - -//! wrap shared_ptr -class SharedScalar { - bool m_dtype_locked = false; - std::shared_ptr m_val; - mgb::HostTensorND m_val_as_host_nd; - mgb::CompNode::UnorderedMap> m_dev_val; - - mgb::HostTensorND& val_as_host_nd(); - - public: - SharedScalar(PyObject *val); - void _set(PyObject *val); - PyObject* _get(); - mgb::SymbolVar _as_sym_var(CompGraph &cg, mgb::CompNode &cn); - - void _lock_dtype() { - m_dtype_locked = true; - } - - bool _dtype_locked() { - return m_dtype_locked; - } - - const std::shared_ptr& get_val() const { - return m_val; - } -}; - -/*! - * \brief wrap around shared mgb::cg::OperatorNodeBase - */ -class Operator { - mgb::cg::OperatorNodeBase* m_operator_node; - std::string m_params; - - static const std::unique_ptr sm_opr_footprint_ptr; - -public: - Operator() : m_operator_node(nullptr){}; - Operator(mgb::cg::OperatorNodeBase* operator_node) - : m_operator_node(operator_node), - m_params(std::move( - (sm_opr_footprint_ptr->calc_footprint(m_operator_node)).param->to_string())) - {} - - size_t id() const { return m_operator_node->id(); } - - const std::string& name() const { return m_operator_node->name(); } - - const std::string& params() const { return m_params; } - - const std::shared_ptr get_owner_graph() const { - return m_operator_node->owner_graph()->shared_from_this(); - } - - const mgb::SymbolVarArray inputs() const { - return mgb::cg::to_symbol_var_array(m_operator_node->input()); - } - - const mgb::SymbolVarArray outputs() const { - return mgb::cg::to_symbol_var_array(m_operator_node->output()); - } - - mgb::cg::OperatorNodeBase* node() const { return m_operator_node; } -}; - -//! get inferred value as numpy ndarray or None -PyObject* get_symvar_inferred_value(mgb::SymbolVar var); - -mgb::SymbolVar fill_retain_dtype(mgb::SymbolVar var, PyObject* value); - -//! whether _mgb_global_finalize() has been called -bool global_finalized(); - -#ifndef SWIG -void mark_as_input(mgb::cg::ComputingGraph* cg, mgb::cg::SymbolVar var); -#endif - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/numpy_incl.h b/python_module/src/cpp/numpy_incl.h deleted file mode 100644 index 02a2b693..00000000 --- a/python_module/src/cpp/numpy_incl.h +++ /dev/null @@ -1,37 +0,0 @@ -/** - * \file python_module/src/cpp/numpy_incl.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief import numpy array with proper settings - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ -#pragma once - -#define PY_ARRAY_UNIQUE_SYMBOL mgb_numpy_array_api -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#include - -#define FOREACH_MGB_LOW_BIT(cb) \ - cb(1) \ - cb(2) \ - cb(4) \ - -#define FOREACH_MGB_DTYPE_PAIR(cb) \ - cb(IntB1, npy_num_intb1()) \ - cb(IntB2, npy_num_intb2()) \ - cb(IntB4, npy_num_intb4()) \ - cb(BFloat16, npy_num_bfloat16()) - -namespace mgb { - //! 
numpy type num for intb2 type -#define DEFINE_NPY_INTBX(n) \ - int npy_num_intb##n(); -FOREACH_MGB_LOW_BIT(DEFINE_NPY_INTBX) -#undef DEFINE_NPY_INTBX - int npy_num_bfloat16(); -} - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/opr_defs.cpp b/python_module/src/cpp/opr_defs.cpp deleted file mode 100644 index 1fd4dbbb..00000000 --- a/python_module/src/cpp/opr_defs.cpp +++ /dev/null @@ -1,313 +0,0 @@ -/** - * \file python_module/src/cpp/opr_defs.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#include "./opr_defs.h" -#include "./opr_helper.h" -#include "./python_helper.h" - -#if MGB_ENABLE_OPR_MM -#include "megbrain/opr/mm_handler.h" -#endif - -#include "megbrain/opr/io.h" -#include "megbrain/serialization/extern_c_opr_io.h" - -using namespace mgb; -using namespace mgb::opr; - -SymbolVar _Opr::_axis_add_remove(SymbolVar src, - const std::vector& axis, bool is_add, - const OperatorNodeConfig &config) { - using ADR = mgb::opr::AxisAddRemove; - std::vector desc; - mgb_assert(!axis.empty()); - for (auto i: axis) { - if (is_add) { - desc.emplace_back(ADR::AxisDesc::make_add(i)); - } else { - desc.emplace_back(ADR::AxisDesc::make_remove(i)); - } - } - return ADR::make(src, desc, config); -} - -SymbolVarArray _Opr::param_pack_split( - SymbolVar src, const std::vector>& shapes, - const OperatorNodeConfig& config) { - auto size = shapes.size(); - mgb::TensorShapeArray shapearr(size); - for (size_t i = 0; i < size; i++) { - shapearr[i] = npy::vec2shape(shapes[i]); - } - - auto cn = src.node()->comp_node(); - auto offsets = megdnn::ParamPackConcat::gen_offsets( - shapearr, cn.get_mem_addr_alignment(), src.dtype().size()); - - return mgb::opr::ParamPackSplit::make(src, offsets, shapearr, config); -} - -#if MGB_ENABLE_OPR_MM -#include "megbrain/opr/lock.h" -#include "megbrain/opr/io_remote.h" - -SymbolVar _Opr::lock_acquire(SymbolVar var, size_t lock_id, size_t group_id, - const OperatorNodeConfig &config) { - return mgb::opr::LockAcquire::make(var, {lock_id, group_id}, config); -} - -SymbolVar _Opr::lock_release(SymbolVar var, size_t lock_id, size_t group_id, - const OperatorNodeConfig &config) { - return mgb::opr::LockRelease::make(var, {lock_id, group_id}, config); -} - -SymbolVar _Opr::remote_send( - const std::string& server_addr, const int port, - const std::string& key, SymbolVar var, - const bool is_grad, - const OperatorNodeConfig& config) { - return RemoteSend::make(key, var, - std::make_shared(ssprintf( - "%s:%d", server_addr.c_str(), port)), - is_grad, config); -} - -SymbolVar _Opr::remote_recv(const std::string& server_addr, const int port, - const std::string& key, CompGraph& graph, - const std::vector& shape, PyObject* dtype, - const OperatorNodeConfig& config) { - const TensorShape ishape = npy::vec2shape(shape); - const DType idtype = npy::dtype_np2mgb(dtype); - - return RemoteRecv::make(key, graph.get(), - std::make_shared( - ssprintf("%s:%d", server_addr.c_str(), port)), - config, ishape, idtype); -} - -SymbolVar _Opr::collective_comm_with_input( - SymbolVar inpvar, const std::string& key, const size_t nr_devices, - const bool is_root, const int rank, const bool local_grad, - const std::string& server_addr, const int port, PyObject* params, - PyObject* dtype, const std::string& backend, SharedND* output_buf, - const OperatorNodeConfig& config, const SharedScalar& disable) { - SymbolVarArray inputs(1, 
inpvar); - ComputingGraph* graph = inpvar.node()->owner_graph(); - auto group_mgr = std::make_shared( - ssprintf("%s:%d", server_addr.c_str(), port)); - SmallVector> dev_buffer_arr(1, nullptr); - if (output_buf) - dev_buffer_arr[0] = output_buf->dev_tensor(); - CollectiveComm::Param param = load_collective_comm_params(params, graph); - mgb::DType _dtype = DType(); - if (dtype != Py_None) { - _dtype = npy::dtype_np2mgb(dtype); - } - return CollectiveComm::make(inputs, graph, key, nr_devices, is_root, rank, - local_grad, group_mgr, dev_buffer_arr, param, - _dtype, backend, config, disable.get_val())[0]; -} - -SymbolVar _Opr::collective_comm_without_input( - CompGraph& cg, const std::string& key, const size_t nr_devices, - const bool is_root, const int rank, const bool local_grad, - const std::string& server_addr, const int port, PyObject* params, - PyObject* dtype, const std::string& backend, SharedND* output_buf, - const OperatorNodeConfig& config, const SharedScalar& disable) { - SymbolVarArray inputs; - auto& graph = cg.get(); - auto group_mgr = std::make_shared( - ssprintf("%s:%d", server_addr.c_str(), port)); - SmallVector> dev_buffer_arr(1, nullptr); - if (output_buf) - dev_buffer_arr[0] = output_buf->dev_tensor(); - CollectiveComm::Param param = load_collective_comm_params(params, &graph); - mgb::DType _dtype = DType(); - if (dtype != Py_None) { - _dtype = npy::dtype_np2mgb(dtype); - } - return CollectiveComm::make(inputs, &graph, key, nr_devices, is_root, rank, - local_grad, group_mgr, dev_buffer_arr, param, - _dtype, backend, config, disable.get_val())[0]; -} - -#else -namespace { - [[noreturn]] void on_opr_mm() { - mgb_throw(MegBrainError, "opr-mm disabled at compile time"); - } -} -SymbolVar _Opr::lock_acquire(SymbolVar var, size_t lock_id, size_t group_id, - const OperatorNodeConfig &config) { - on_opr_mm(); -} - -SymbolVar _Opr::lock_release(SymbolVar var, size_t lock_id, size_t group_id, - const OperatorNodeConfig &config) { - on_opr_mm(); -} - - -SymbolVar _Opr::remote_send( - const std::string& server_addr, const int port, - const std::string& key, SymbolVar var, - const bool is_grad, - const OperatorNodeConfig& config) { - on_opr_mm(); -} - -SymbolVar _Opr::remote_recv(const std::string& server_addr, const int port, - const std::string& key, CompGraph& graph, - const std::vector& shape, PyObject* dtype, - const OperatorNodeConfig& config) { - on_opr_mm(); -} - -SymbolVar _Opr::collective_comm_with_input( - SymbolVar inpvar, const std::string& key, const size_t nr_devices, - const bool is_root, const int rank, const bool local_grad, - const std::string& server_addr, const int port, PyObject* params, - PyObject* dtype, const std::string& backend, SharedND* output_buf, - const OperatorNodeConfig& config, const SharedScalar& disable) { - on_opr_mm(); -} - -SymbolVar _Opr::collective_comm_without_input( - CompGraph& cg, const std::string& key, const size_t nr_devices, - const bool is_root, const int rank, const bool local_grad, - const std::string& server_addr, const int port, PyObject* params, - PyObject* dtype, const std::string& backend, SharedND* output_buf, - const OperatorNodeConfig& config, const SharedScalar& disable) { - on_opr_mm(); -} - -#endif // MGB_ENABLE_OPR_MM - -SymbolVarArray _Opr::extern_c_opr_placeholder( - const SymbolVarArray& inputs, - const std::vector>& output_shapes, - PyObject* output_dtypes, const char* dump_name, PyObject* data_bytes, - const OperatorNodeConfig& config) { - mgb_assert(PyBytes_Check(data_bytes)); - if (output_dtypes != Py_None) { - 
mgb_assert(PyTuple_Check(output_dtypes)); - mgb_assert(output_shapes.size() == - static_cast(PyTuple_Size(output_dtypes))); - } - - TensorShapeArray cpp_output_shapes(output_shapes.size()); - for (size_t i = 0; i < output_shapes.size(); ++i) { - cpp_output_shapes[i] = npy::vec2shape(output_shapes[i]); - } - SmallVector cpp_output_dtypes; - if (output_dtypes != Py_None) { - size_t dtype_size = PyTuple_Size(output_dtypes); - for (size_t i = 0; i < dtype_size; ++i) { - cpp_output_dtypes.push_back( - npy::dtype_np2mgb(PyTuple_GetItem(output_dtypes, i))); - } - } - - auto opr = opr::ExternCOprRunner::make_placeholder( - inputs, cpp_output_shapes, dump_name, PyBytes_AsString(data_bytes), - PyBytes_Size(data_bytes), config, cpp_output_dtypes); - SymbolVarArray ret; - ret.reserve(opr->output().size()); - for (auto i: opr->output()) - ret.emplace_back(i); - return ret; -} - -#if MGB_ENABLE_TENSOR_RT - -#include "megbrain/tensorrt/tensorrt_runtime_opr.h" - -SymbolVarArray _Opr::tensor_rt_runtime(const SymbolVarArray& inputs, - PyObject* data_bytes, - const OperatorNodeConfig& config) { - mgb_assert(PyBytes_Check(data_bytes)); - auto size = PyBytes_Size(data_bytes); - mgb_assert(size, "trt data bytes should not be empty"); - return opr::TensorRTRuntimeOpr::make(PyBytes_AsString(data_bytes), - size, inputs, - config); -} -#else -SymbolVarArray _Opr::tensor_rt_runtime(const SymbolVarArray& inputs, - PyObject* data_bytes, - const OperatorNodeConfig& config) { - mgb_throw(MegBrainError, "TensorRT disabled at compile time"); -} -#endif - -#if MGB_ATLAS - -#include "megbrain/opr/atlas_runtime_op.h" - -SymbolVarArray _Opr::atlas_runtime(const SymbolVarArray& inputs, - PyObject* data_bytes, - const OperatorNodeConfig& config) { - mgb_assert(PyBytes_Check(data_bytes)); - auto size = PyBytes_Size(data_bytes); - mgb_assert(size, "atlas data bytes should not be empty"); - - return opr::AtlasRuntimeOpr::make(PyBytes_AsString(data_bytes), size, - inputs, config); -} -#else -SymbolVarArray _Opr::atlas_runtime(const SymbolVarArray& inputs, - PyObject* data_bytes, - const OperatorNodeConfig& config) { - mgb_throw(MegBrainError, "Atlas disabled at compile time"); -} -#endif - -SymbolVar _Opr::timestamp(SymbolVar input, PyObject* dest, size_t dest_off, - const OperatorNodeConfig& config) { - auto tensor = std::make_shared( - npy::np2tensor(dest, npy::Meth::must_borrow(), dtype::Float32{})); - return opr::Timestamp::make(input, std::move(tensor), dest_off, config); -} - -SymbolVar _Opr::virtual_loss(const SymbolVarArray& ys, - const SymbolVarArray& y_grads, - const OperatorNodeConfig& config) { - return opr::VirtualLoss::make(ys, y_grads, {}, config); -} - -SymbolVar _Opr::virtual_dep(const SymbolVarArray& symvars, - const OperatorNodeConfig& config) { - return opr::VirtualDep::make(symvars, config); -} - - -#if MGB_CAMBRICON -#include "megbrain/cambricon/cambricon_runtime_opr.h" - -SymbolVarArray _Opr::cambricon_runtime(PyObject* data_bytes, const char* symbol, - const SymbolVarArray& inputs, - bool tensor_dim_mutable, - const OperatorNodeConfig& config) { - mgb_assert(PyBytes_Check(data_bytes)); - auto size = PyBytes_Size(data_bytes); - mgb_assert(size, "cambricon data bytes should not be empty"); - return opr::CambriconRuntimeOpr::make(PyBytes_AsString(data_bytes), size, - symbol, inputs, tensor_dim_mutable, - config); -} -#else -SymbolVarArray _Opr::cambricon_runtime(PyObject* data_bytes, const char* symbol, - const SymbolVarArray& inputs, - bool tensor_dim_mutable, - const OperatorNodeConfig& config) { - 
mgb_throw(MegBrainError, "Cambricon disabled at compile time"); -} -#endif - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/opr_defs.h b/python_module/src/cpp/opr_defs.h deleted file mode 100644 index 29a70c5f..00000000 --- a/python_module/src/cpp/opr_defs.h +++ /dev/null @@ -1,170 +0,0 @@ -/** - * \file python_module/src/cpp/opr_defs.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief extra opr definitions - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#ifndef SWIG -#pragma once - -#include "./megbrain_wrap.h" -#include "./opr_helper.h" - -#if MGB_ENABLE_OPR_MM -#include "megbrain/opr/collective_comm.h" -#endif -#include "megbrain/opr/basic_arith.h" -#include "megbrain/opr/tensor_manip.h" -using mgb::SymbolVar; -using mgb::SymbolVarArray; -using mgb::OperatorNodeConfig; - -#endif - -class _Opr { - -public: - -// basic arith - -static SymbolVar add_update(SymbolVar dest, SymbolVar delta, - const SharedScalar &alpha, const SharedScalar &beta, - const SharedScalar &bias, const SharedScalar &disable, - const OperatorNodeConfig &config) { - return mgb::opr::AddUpdate::make(dest, delta, - {alpha.get_val(), beta.get_val(), bias.get_val(), disable.get_val()}, - config); -} - -// tensor manip - -static SymbolVarArray param_pack_split( - SymbolVar src, const std::vector>& shapes, - const OperatorNodeConfig& config); - -static SymbolVar dimshuffle(SymbolVar src, - const std::vector &pattern, size_t ndim, - const OperatorNodeConfig &config) { - return mgb::opr::Dimshuffle::make(src, pattern, ndim, config); -} - -static SymbolVar _axis_add_remove(SymbolVar src, - const std::vector& axis, bool is_add, - const OperatorNodeConfig &config); - -static SymbolVar callback_injector(SymbolVar src, _CompGraphCallback &callback, - const OperatorNodeConfig &config) { - return mgb::opr::CallbackInjector::make(src, callback.make_callback()); -} - -static SymbolVar callback_injector(SymbolVarArray src, _CompGraphCallback &callback, - const OperatorNodeConfig &config) { - return mgb::opr::CallbackInjector::make(src, callback.make_multi_input_callback()); -} - -static SymbolVar set_grad(SymbolVar src, _SetGradCallback &grad_getter, - const OperatorNodeConfig &config) { - return mgb::opr::SetGrad::make(src, grad_getter.make_callback(), config); -} - -// multi machine - -static SymbolVar lock_acquire(SymbolVar var, size_t lock_id, size_t group_id, - const OperatorNodeConfig &config); - -static SymbolVar lock_release(SymbolVar var, size_t lock_id, size_t group_id, - const OperatorNodeConfig &config); - -static SymbolVar remote_send( - const std::string& server_addr, const int port, - const std::string& key, SymbolVar var, - const bool is_grad, - const OperatorNodeConfig& config); - -static SymbolVar remote_recv(const std::string& server_addr, const int port, - const std::string& key, - CompGraph& graph, - const std::vector& shape, PyObject* dtype, - const OperatorNodeConfig& config); - -static SymbolVar collective_comm_with_input( - SymbolVar inpvar, const std::string& key, const size_t nr_devices, - const bool is_root, const int rank, const bool local_grad, - const std::string& server_addr, const int port, PyObject* params, - PyObject* dtype, const std::string& backend, SharedND* output_buf, - const OperatorNodeConfig& config, const SharedScalar& disable); - -static SymbolVar collective_comm_without_input( - CompGraph& graph, const std::string& key, const size_t nr_devices, - 
const bool is_root, const int rank, const bool local_grad, - const std::string& server_addr, const int port, PyObject* params, - PyObject* dtype, const std::string& backend, SharedND* output_buf, - const OperatorNodeConfig& config, const SharedScalar& disable); - -// misc -static SymbolVarArray extern_c_opr_placeholder( - const SymbolVarArray& inputs, - const std::vector>& output_shapes, - PyObject* dtypes, - const char* dump_name, PyObject* data_bytes, - const OperatorNodeConfig& config); - -static SymbolVarArray tensor_rt_runtime(const SymbolVarArray& inputs, - PyObject* data_bytes, - const OperatorNodeConfig& config); - - - -static SymbolVar timestamp(SymbolVar input, PyObject* dest, size_t dest_off, - const OperatorNodeConfig& config); - -static SymbolVar virtual_loss(const SymbolVarArray& ys, - const SymbolVarArray& y_grads, - const OperatorNodeConfig& config); - -static SymbolVar virtual_dep(const SymbolVarArray& symvars, - const OperatorNodeConfig& config); - -static SymbolVarArray atlas_runtime(const SymbolVarArray& inputs, - PyObject* data_bytes, - const OperatorNodeConfig& config); - - -static SymbolVarArray cambricon_runtime(PyObject* data_bytes, - const char* symbol, - const SymbolVarArray& inputs, - bool tensor_dim_mutable, - const OperatorNodeConfig& config); - -#ifdef SWIG -%pythoncode { - -@classmethod -def _make_axis_vec(cls, axis): - ret = _VectorInt() - if isinstance(axis, collections.Iterable): - for i in axis: - ret.push_back(i) - else: - ret.push_back(axis) - return ret - -@classmethod -def add_axis(cls, src, axis, config): - return cls._axis_add_remove(src, cls._make_axis_vec(axis), True, config) - -@classmethod -def remove_axis(cls, src, axis, config): - return cls._axis_add_remove(src, cls._make_axis_vec(axis), False, config) - -} // %pythoncode -#endif // SWIG - -}; - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/opr_helper.cpp b/python_module/src/cpp/opr_helper.cpp deleted file mode 100644 index 0baa47c2..00000000 --- a/python_module/src/cpp/opr_helper.cpp +++ /dev/null @@ -1,192 +0,0 @@ -/** - * \file python_module/src/cpp/opr_helper.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
- * - */ - -#include "./opr_helper.h" -#include "./megbrain_wrap.h" -#include "megbrain/opr/indexing.h" -#include "megbrain/opr/io.h" -#include "megbrain/serialization/opr_load_dump.h" - -using namespace mgb; - -namespace { - class OprParamsLoadContext final: public serialization::OprLoadContextRawPOD { - PyObject *m_params; - ComputingGraph *m_graph; - size_t m_nr_used_params = 0, m_param_size = 0; - size_t m_item_bytes_consumed = 0; - - void read_raw(void *dest, size_t size) override final { - mgb_assert(m_nr_used_params < m_param_size); - auto item = PyList_GetItem(m_params, m_nr_used_params); - mgb_assert(item, "failed to get item %zu", m_nr_used_params); - mgb_assert(PyBytes_Check(item), "list item must be bytes"); - auto item_size = PyBytes_Size(item); - mgb_assert(size < (SIZE_MAX >> 3)); - mgb_assert(m_item_bytes_consumed + size <= size_t(item_size)); - auto item_buf = PyBytes_AsString(item); - mgb_assert(item_size > 0 && item_buf); - memcpy(dest, item_buf + m_item_bytes_consumed, size); - m_item_bytes_consumed += size; - if (m_item_bytes_consumed == size_t(item_size)) { - ++ m_nr_used_params; - m_item_bytes_consumed = 0; - } - } - - std::shared_ptr load_tensor() override { - mgb_assert(0); - } - - std::shared_ptr load_tensor_shared() override { - mgb_assert(0); - } - - const serialization::GraphLoadConfig& config() const override { - mgb_assert(0); - } - - public: - OprParamsLoadContext(PyObject *params, ComputingGraph *graph): - m_params{params}, m_graph{graph} - { - mgb_assert(PyList_Check(params), "params must be a list"); - m_param_size = PyList_Size(params); - } - - ~OprParamsLoadContext() { - mgb_assert(m_nr_used_params == m_param_size, - "number of params mismatch"); - } - - ComputingGraph& graph() override { - return *m_graph; - } - }; -} // anonymous namespace - -_SplitPartCallback::callback_t _SplitPartCallback::make_callback() { - mgb_assert(!m_cb_created); - m_cb_created = true; - - std::shared_ptr<_SplitPartCallback> cb_ptr(this); - - auto cb = [cb_ptr](size_t sz) { - return cb_ptr->call(sz); - }; - - return cb; -} - -_SetGradCallback::callback_t _SetGradCallback::make_callback() { - mgb_assert(!m_cb_created); - m_cb_created = true; - - if (empty()) { - return {}; - } - - std::shared_ptr<_SetGradCallback> cb_ptr(this); - - auto cb = [cb_ptr](const opr::SetGrad& opr) { - auto graph = CompGraph::make_from_weak_ptr( - opr.owner_graph()->shared_from_this()); - return cb_ptr->call(graph); - }; - - return cb; -} - -_TimeoutCallback::callback_t _TimeoutCallback::make_callback() { - mgb_assert(!m_cb_created); - m_cb_created = true; - - std::shared_ptr<_TimeoutCallback> cb_ptr(this); - auto cb = [cb_ptr]() { - return cb_ptr->call(); - }; - return cb; -} - -mgb::SymbolVar _create_subtensor_like_opr( - const std::string &name, - const SymbolVarArray& inputs, - const std::vector &idx, - const mgb::OperatorNodeConfig &config) { -#define CHK1(_name, _opr) \ - if (name == _name) { \ - mgb_assert(inputs.size() == 1); \ - return opr::_opr::make(inputs[0], idx, config); \ - } -#define CHK2(_name, _opr) \ - if (name == _name) { \ - mgb_assert(inputs.size() == 2); \ - return opr::_opr::make(inputs[0], inputs[1], idx, config); \ - } - - CHK1("subtensor", Subtensor); - CHK2("set_subtensor", SetSubtensor); - CHK2("incr_subtensor", IncrSubtensor); - CHK1("mavi", IndexingMultiAxisVec); - CHK2("set_mavi", IndexingSetMultiAxisVec); - CHK2("incr_mavi", IndexingIncrMultiAxisVec); - CHK1("mesh_indexing", MeshIndexing); - CHK1("batched_mesh_indexing", BatchedMeshIndexing); - 
CHK2("incr_mesh_indexing", IncrMeshIndexing); - CHK2("set_mesh_indexing", SetMeshIndexing); - CHK2("batched_incr_mesh_indexing", BatchedIncrMeshIndexing); - CHK2("batched_set_mesh_indexing", BatchedSetMeshIndexing); - - mgb_throw(MegBrainError, "bad subtensor opr name: %s", name.c_str()); - -#undef CHK1 -#undef CHK2 -} - -SymbolVar _make_immutable(CompGraph &comp_graph, PyObject *npyarr, - PyObject *dtype, const mgb::cg::OperatorNodeConfig &config) { - - auto cn = config.get_single_comp_node(); - mgb_assert(cn.valid(), "invalid comp node given to make_tensor"); - DType dtype_mgb; - if (dtype && dtype != Py_None) - dtype_mgb = npy::dtype_np2mgb(dtype); - auto hv = npy::np2tensor(npyarr, npy::Meth::borrow(cn), dtype_mgb); - return opr::ImmutableTensor::make(comp_graph.get(), hv, config); -} - -SymbolVarArray _create_opr( - const char *name, const SymbolVarArray &inputs, - PyObject *params, const OperatorNodeConfig &config) { - mgb_assert(!inputs.empty()); - auto registry = serialization::OprRegistry::find_by_name(name); - mgb_assert(registry, "operator %s not found", name); - OprParamsLoadContext ctx{params, inputs[0].node()->owner_graph()}; - VarNodeArray vinputs(inputs.size()); - for (size_t i = 0; i < inputs.size(); ++ i) - vinputs[i] = inputs[i].node(); - auto opr = registry->loader(ctx, vinputs, config); - - SymbolVarArray ret; - for (auto i: opr->output()) { - if (!i->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) - ret.push_back(i); - } - return ret; -} - -#if MGB_ENABLE_OPR_MM -mgb::opr::CollectiveComm::Param load_collective_comm_params( - PyObject* params, mgb::ComputingGraph* graph) { - OprParamsLoadContext ctx{params, graph}; - return ctx.read_param(); -} -#endif - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/opr_helper.h b/python_module/src/cpp/opr_helper.h deleted file mode 100644 index 15b49d5a..00000000 --- a/python_module/src/cpp/opr_helper.h +++ /dev/null @@ -1,71 +0,0 @@ -/** - * \file python_module/src/cpp/opr_helper.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief helper for wrapping special oprs - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#pragma once - -#include "./megbrain_wrap.h" - -#include "megbrain/opr/tensor_manip.h" -#include "megbrain/opr/utility.h" -#if MGB_ENABLE_OPR_MM -#include "megbrain/opr/collective_comm.h" -#endif -using AxisIndexer = mgb::opr::indexing::AxisIndexer; - -/*! - * \brief wrapping callbacks used for opr::Split::Options::make_callback - */ -class _SplitPartCallback { - bool m_cb_created = false; - - public: - virtual ~_SplitPartCallback() = default; - virtual std::vector call(size_t tot_size) = 0; - - using callback_t = mgb::opr::Split::Options::callback_t; - callback_t make_callback(); -}; - -class _SetGradCallback { - bool m_cb_created = false; - - public: - virtual ~_SetGradCallback() = default; - virtual mgb::SymbolVar call(CompGraph &graph) = 0; - virtual bool empty() = 0; - - using callback_t = mgb::opr::SetGrad::GradGetter; - callback_t make_callback(); -}; - -/*! - * \brief wrapping callbacks used for subclasses of opr::RemoteIOBase - */ -class _TimeoutCallback { - bool m_cb_created = false; - - public: - virtual ~_TimeoutCallback() = default; - /*! - * \brief Will be overrided by swig generated code, calls into Python. 
- */ - virtual bool call() = 0; - - using callback_t = mgb::thin_function; - callback_t make_callback(); -}; - -#if MGB_ENABLE_OPR_MM -mgb::opr::CollectiveComm::Param load_collective_comm_params( - PyObject* params, mgb::ComputingGraph* graph); -#endif - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/plugin.cpp b/python_module/src/cpp/plugin.cpp deleted file mode 100644 index 5ca4df44..00000000 --- a/python_module/src/cpp/plugin.cpp +++ /dev/null @@ -1,243 +0,0 @@ -/** - * \file python_module/src/cpp/plugin.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief helpers for debugging - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#include "./plugin.h" -#include "./python_helper.h" - -#include "megbrain/system.h" - -#include -#include -#include - -#ifdef WIN32 -#include -#else -#include -#include -#endif -#include - -/* ================= _InfkernFinderImpl ================= */ -size_t _InfkernFinderImpl::sm_id = 0; - -_InfkernFinderImpl::_InfkernFinderImpl(CompGraph &cg, bool record_input_value): - m_id{sm_id ++}, - m_comp_graph{cg.get().shared_from_this()}, - m_finder{m_comp_graph.get(), record_input_value} -{ -} - -size_t _InfkernFinderImpl::_write_to_file(const char *fpath) { - auto opr = m_finder.write_to_file(fpath); - if (opr) - return opr->id() + 1; - return 0; -} - -size_t _InfkernFinderImpl::_get_input_values_prepare(size_t opr_id) { - m_inp_val = m_finder.get_input_values(opr_id); - return m_inp_val.size(); -} - -const char* _InfkernFinderImpl::_get_input_values_var_name(size_t idx) { - return m_inp_val.at(idx).first->cname(); -} - -size_t _InfkernFinderImpl::_get_input_values_var_idx(size_t idx) { - return m_inp_val.at(idx).first->id(); -} - -size_t _InfkernFinderImpl::_get_input_values_run_id(size_t idx) { - return m_inp_val.at(idx).second.run_id; -} - -CompGraphCallbackValueProxy _InfkernFinderImpl::_get_input_values_val(size_t idx) { - return CompGraphCallbackValueProxy::make_raw_host_value_proxy( - m_inp_val.at(idx).second.val); -} - -std::string _InfkernFinderImpl::__repr__() { - return mgb::ssprintf( - "_InfkernFinderImpl(%zu,graph=%p)", m_id, m_comp_graph.get()); -} - -/* ================= _FastSignal ================= */ - -class _FastSignal::Impl { - using HandlerCallback = std::function; - bool m_worker_started = false; - std::mutex m_mtx; - std::thread m_worker_hdl; -#ifdef WIN32 - SECURITY_ATTRIBUTES win_sa = {sizeof(SECURITY_ATTRIBUTES), NULL, TRUE}; - HANDLE pipe_r, pipe_w; - DWORD bytes_r_w; -#else - int m_pfd[2]; //! 
pipe fds; write signal handlers, -1 for exit -#endif - std::unordered_map m_handler_callbacks; - - void worker() { - std::ostringstream oss; - oss << std::this_thread::get_id() << std::endl; - mgb_log("fast signal worker started in thread %s", oss.str().c_str()); - mgb::sys::set_thread_name("fastsgl"); - int signum; - for (;;) { -#ifdef WIN32 - if (ReadFile(pipe_r, &signum, sizeof(int), &bytes_r_w, NULL) == - NULL) { -#else - if (read(m_pfd[0], &signum, sizeof(int)) != sizeof(int)) { -#endif - if (errno == EINTR) - continue; - mgb_log_error("fast signal worker: " - "failed to read from self pipe: %s", - strerror(errno)); - return; - } - std::exception_ptr exc_ptr; - if (signum == -1) - return; - try { - HandlerCallback *cb; - { - MGB_LOCK_GUARD(m_mtx); - cb = &m_handler_callbacks.at(signum); - } - (*cb)(); - } MGB_CATCH_ALL_EXCEPTION("fast signal worker", exc_ptr); - } - } - - void setup() { - if (m_worker_started) - return; - -#ifdef WIN32 - if (!CreatePipe(&pipe_r, &pipe_w, &win_sa, 0)) { - throw mgb::MegBrainError(mgb::ssprintf("failed to create pipe: %s", - strerror(errno))); - } -#else - if (pipe(m_pfd)) { - throw mgb::MegBrainError(mgb::ssprintf("failed to create pipe: %s", - strerror(errno))); - } -#endif - std::thread t(std::bind(&Impl::worker, this)); - m_worker_hdl.swap(t); - m_worker_started = true; - } - - void write_pipe(int v) { - mgb_assert(m_worker_started); -#ifdef WIN32 - if (WriteFile(pipe_w, &v, sizeof(int), &bytes_r_w, NULL) == NULL) { -#else - if (write(m_pfd[1], &v, sizeof(int)) != sizeof(int)) { -#endif - mgb_log_error("fast signal: failed to write to self pipe: %s", - strerror(errno)); - } - } - - public: - bool worker_started() const { - return m_worker_started; - } - - void register_handler(int signum, PyObject *func) { - setup(); - - { - PYTHON_GIL; - mgb_assert(PyCallable_Check(func)); - Py_INCREF(func); - } - auto deleter = [](PyObject *f){ - PYTHON_GIL; - Py_DECREF(f); - }; - std::shared_ptr funcptr(func, deleter); - - auto callback = [funcptr]() { - PYTHON_GIL; - auto func = funcptr.get(); - auto ret = PyObject_CallObject(func, nullptr); - mgb_assert(ret, "failed to call pyobj %p; repr=%s", - func, PyUnicode_AsUTF8(PyObject_Repr(func))); - Py_DECREF(ret); - }; - - MGB_LOCK_GUARD(m_mtx); - m_handler_callbacks[signum] = callback; - } - - void shutdown() { - MGB_LOCK_GUARD(m_mtx); - if (!m_worker_started) - return; - write_pipe(-1); - m_worker_hdl.join(); -#ifdef WIN32 - CloseHandle(pipe_r); - CloseHandle(pipe_w); -#else - close(m_pfd[0]); - close(m_pfd[1]); -#endif - m_handler_callbacks.clear(); - m_worker_started = false; - } - - void signal_hander(int signum) { - write_pipe(signum); - } - - ~Impl() { - shutdown(); - } -}; - -_FastSignal::Impl _FastSignal::sm_impl; - -void _FastSignal::signal_hander(int signum) { - if (sm_impl.worker_started()) - sm_impl.signal_hander(signum); -} - -void _FastSignal::register_handler(int signum, PyObject *func) { -#ifdef WIN32 - //! 
up to now we can only use CTRL_C_EVENT to unix signal.SIGUSR1/2 - //FIXME: how to coherence signal number at python side - // https://docs.microsoft.com/en-gb/cpp/c-runtime-library/reference/signal?view=vs-2017 - mgb_assert(signum == CTRL_C_EVENT, "only allow register CTRL_C_EVENT as unix signal.SIGUSR1/2 now"); - signal(signum, signal_hander); -#else - struct sigaction action; - memset(&action, 0, sizeof(action)); - action.sa_handler = &signal_hander; - int ret = sigaction(signum, &action, nullptr); - mgb_assert(!ret, "sigaction failed: %s", strerror(errno)); -#endif - - sm_impl.register_handler(signum, func); -} - -void _FastSignal::shutdown() { - sm_impl.shutdown(); -} - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} - diff --git a/python_module/src/cpp/plugin.h b/python_module/src/cpp/plugin.h deleted file mode 100644 index 5253a0bb..00000000 --- a/python_module/src/cpp/plugin.h +++ /dev/null @@ -1,135 +0,0 @@ -/** - * \file python_module/src/cpp/plugin.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief helpers for debugging - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - - -#ifndef SWIG - -#pragma once - -#include "./megbrain_wrap.h" - -#include "megbrain/plugin/profiler.h" -#include "megbrain/plugin/infkern_finder.h" -#include "megbrain/plugin/num_range_checker.h" -#include "megbrain/plugin/opr_io_dump.h" - -#endif // SWIG - -#include - -class _CompGraphProfilerImpl { -#ifndef SWIG - std::shared_ptr m_comp_graph; - mgb::GraphProfiler m_profiler; -#endif - - public: - _CompGraphProfilerImpl(CompGraph &cg): - m_comp_graph{cg.get().shared_from_this()}, - m_profiler{m_comp_graph.get()} - { - } - - std::string _get_result() { - auto json = m_profiler.to_json_full( - m_comp_graph->current_comp_seq()); - return json->to_string(); - } -}; - -class _NumRangeCheckerImpl { -#ifndef SWIG - std::shared_ptr m_comp_graph; - mgb::NumRangeChecker m_checker; -#endif - - public: - _NumRangeCheckerImpl(CompGraph &cg, float range): - m_comp_graph{cg.get().shared_from_this()}, - m_checker{m_comp_graph.get(), range} - { - } -}; - -class _TextOprIODumpImpl { -#ifndef SWIG - std::shared_ptr m_comp_graph; - mgb::TextOprIODump m_dump; -#endif - - public: - _TextOprIODumpImpl(CompGraph &cg, const char *fpath): - m_comp_graph{cg.get().shared_from_this()}, - m_dump{m_comp_graph.get(), fpath} - { - } - - void _print_addr(bool flag) { - m_dump.print_addr(flag); - } - - void _max_size(size_t size) { - m_dump.max_size(size); - } -}; - -class _BinaryOprIODumpImpl { -#ifndef SWIG - std::shared_ptr m_comp_graph; - mgb::BinaryOprIODump m_dump; -#endif - - public: - _BinaryOprIODumpImpl(CompGraph &cg, const char *fpath): - m_comp_graph{cg.get().shared_from_this()}, - m_dump{m_comp_graph.get(), fpath} - { - } -}; - -class _InfkernFinderImpl { -#ifndef SWIG - static size_t sm_id; - const size_t m_id; - std::shared_ptr m_comp_graph; - mgb::InfkernFinder m_finder; - mgb::InfkernFinder::InputValueRecord::FullRecord m_inp_val; -#endif - - public: - _InfkernFinderImpl(CompGraph &cg, bool record_input_value); - - size_t _write_to_file(const char *fpath); - - size_t _get_input_values_prepare(size_t opr_id); - const char* _get_input_values_var_name(size_t idx); - size_t _get_input_values_var_idx(size_t idx); - size_t _get_input_values_run_id(size_t idx); - CompGraphCallbackValueProxy _get_input_values_val(size_t idx); - - std::string __repr__(); - -}; - -class _FastSignal { -#ifndef SWIG - class Impl; - static Impl 
sm_impl; - - static void signal_hander(int signum); -#endif - public: - static void register_handler(int signum, PyObject *func); - static void shutdown(); -}; - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} - diff --git a/python_module/src/cpp/python_helper.cpp b/python_module/src/cpp/python_helper.cpp deleted file mode 100644 index 1cc97d7a..00000000 --- a/python_module/src/cpp/python_helper.cpp +++ /dev/null @@ -1,911 +0,0 @@ -/** - * \file python_module/src/cpp/python_helper.cpp - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief helper utilities for python integration - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - */ - -#include "./python_helper.h" -#include "megbrain/graph/exc_extra_info.h" -#include "megbrain/graph/event.h" -#include "megbrain/graph/cg.h" -#include "megbrain/utils/mempool.h" - -#include "./numpy_incl.h" - -/* - * demangle typeid, see - * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname - */ -#ifdef __GNUG__ -#include -#include -#include - -namespace { - -std::string demangle_typeid(const char* name) { - - int status = -4; // some arbitrary value to eliminate the compiler warning - - // enable c++11 by passing the flag -std=c++11 to g++ - std::unique_ptr res { - abi::__cxa_demangle(name, nullptr, nullptr, &status), - std::free - }; - - return (status==0) ? res.get() : name ; -} -} -#else - -namespace { -// does nothing if not g++ -std::string demangle_typeid(const char* name) { - return name; -} -} - -#endif - -using namespace mgb; -using namespace cg; - -PyStackExtracter* PyStackExtracter::ins = nullptr; - -namespace { - - std::string repr_pyobj(PyObject *obj) { - if (!obj) - return ""; - PYTHON_GIL; - auto str = PyObject_Repr(obj); - if (!str) - return ssprintf("", obj); - std::string ret{PyUnicode_AsUTF8(str)}; - Py_DECREF(str); - return ret; - } - - template - std::string typeid_name(const T &t) { - return demangle_typeid(typeid(t).name()); - } - -} // anonymous namespace - -/* ============== OprPyTracker ============== */ - -class OprPyTracker::TrackerStorage final : public UserDataContainer::UserData, - public NonCopyableObj { - MGB_TYPEINFO_OBJ_DECL; - - PyObject* m_cur_tracker = nullptr; - size_t m_refcnt_to_add = 0; - SyncEventConnecter::ReceiverHandler m_opr_insert_handler; - ThinHashMap m_opr2tracker; - -public: - explicit TrackerStorage(ComputingGraph& graph) { - auto on_new_opr = [this](const event::OprInserted& ev) { - if (!ev.is_dedup && !ev.exc) { - if (m_cur_tracker) { - ++m_refcnt_to_add; - m_opr2tracker[ev.opr] = m_cur_tracker; - } - } - }; - m_opr_insert_handler = - graph.event().register_receiver(on_new_opr); - } - - ~TrackerStorage() { - if (m_cur_tracker) { - // manage refcnt of cur tracker - disable(); - } - PYTHON_GIL; - for (auto&& i : m_opr2tracker) { - Py_DecRef(i.second); - } - } - - //! get the instance - static TrackerStorage& inst(ComputingGraph& graph) { - auto make = [&graph]() { - return std::make_shared(graph); - }; - return *graph.options() - .user_data.get_user_data_or_create( - make); - } - - //! get the tracker associated with an opr, or nullptr - PyObject* get(OperatorNodeBase* opr) const { - auto iter = m_opr2tracker.find(opr); - return iter == m_opr2tracker.end() ? 
nullptr : iter->second; - } - - void enable(PyObject* obj) { - mgb_assert(!m_cur_tracker, - "multiple calls to begin_set_tracker() on the same graph"); - m_cur_tracker = obj; - } - - void disable() { - mgb_assert(m_cur_tracker, - "call end_set_tracker() before begin_set_tracker()"); - if (m_refcnt_to_add) { - PYTHON_GIL; - for (size_t i = 0; i < m_refcnt_to_add; ++i) { - Py_IncRef(m_cur_tracker); - } - } - m_cur_tracker = nullptr; - } -}; -MGB_TYPEINFO_OBJ_IMPL(OprPyTracker::TrackerStorage); - -void OprPyTracker::begin_set_tracker(ComputingGraph& graph, PyObject* obj) { - TrackerStorage::inst(graph).enable(obj); -} - -void OprPyTracker::end_set_tracker(ComputingGraph& graph) { - TrackerStorage::inst(graph).disable(); -} - -OprPyTracker::TrackerResult OprPyTracker::get_tracker(mgb::MegBrainError& exc) { - auto ptr = dynamic_cast(exc.extra_info()); - if (!ptr) - return {}; - return get_tracker(ptr->opr()); -} - -OprPyTracker::TrackerResult OprPyTracker::get_tracker( - mgb::cg::OperatorNodeBase* opr) { - TrackerResult ret; - mgb_assert(opr); - ret.exc_opr = opr; - opr = cg::get_opr_root_source_opr(opr); - ret.unopt_opr = opr; - - auto&& storage = TrackerStorage::inst(*opr->owner_graph()); - ret.tracker = storage.get(opr); - - { - auto&& grad_info = opr->node_prop().attribute().grad_tracker; - if (grad_info.valid()) { - ret.opr_grad_src = cg::get_opr_root_source_opr(grad_info->orig_opr); - ret.tracker_grad_src = storage.get(ret.opr_grad_src); - } - } - - return ret; -} - -PyObject* OprPyTracker::TrackerResult::as_tuple(const char *leading_msg) const { - std::string msg; - if (leading_msg) - msg = leading_msg; - - auto print_opr = [&](const char *otype, cg::OperatorNodeBase *opr) { - if (!opr) - return; - - msg += ssprintf("\n%s: id=%zu name=%s type=%s\n", - otype, opr->id(), opr->cname(), - typeid_name(*opr).c_str()); - msg += " input variables: \n"; - size_t idx = 0; - for (auto i: opr->input()) { - msg += ssprintf(" %zu: ", idx ++); - msg += cg::dump_var_info({i}); - msg += "\n"; - } - - msg += " output variables: \n"; - idx = 0; - for (auto i: opr->output()) { - msg += ssprintf(" %zu: ", idx ++); - msg += cg::dump_var_info({i}); - msg += "\n"; - } - }; - - print_opr("Associated operator", exc_opr); - if (unopt_opr != exc_opr) { - print_opr("Unoptimized equivalent of associated operator", unopt_opr); - } - print_opr("Associated operator created by taking grad of", opr_grad_src); - - PYTHON_GIL; - PyObject *py_msg = PyUnicode_FromString(msg.c_str()), - *py_tuple = PyTuple_Pack(3, py_msg, - tracker ? tracker : Py_None, - tracker_grad_src ? 
tracker_grad_src : Py_None); - Py_DECREF(py_msg); - return py_tuple; -} - -std::string blame(mgb::cg::OperatorNodeBase* opr) { - mgb_assert(PyMGBExceptionMaker::py_exc_class, - "Python exception class is not set yet"); - PyObject* args = OprPyTracker::get_tracker(opr).as_tuple(); - - PYTHON_GIL; - - PyObject* py_exc = PyObject_CallObject(PyMGBExceptionMaker::py_exc_class, args); - Py_DECREF(args); - mgb_assert(py_exc); - - PyObject* py_str = PyObject_Str(py_exc); - Py_DECREF(py_exc); - mgb_assert(py_str); - - int err = PyUnicode_READY(py_str); - if (err) { - Py_DECREF(py_str); - mgb_assert(!err); - } - - Py_ssize_t c_str_size; - const char* c_str = PyUnicode_AsUTF8AndSize(py_str, &c_str_size); - if (!c_str) { - Py_DECREF(py_str); - mgb_assert(c_str); - } - std::string ret(c_str, c_str_size); - Py_DECREF(py_str); - return ret; -} - -/* ============== PyMGBExceptionMaker ============== */ -PyObject *PyMGBExceptionMaker::py_exc_class = nullptr; - -void PyMGBExceptionMaker::setup_py_exception(std::exception &exc) { - mgb_assert(py_exc_class); - if (auto cbexc = dynamic_cast(&exc)) { - cbexc->restore(); - return; - } - - std::string msg; - try { - msg = ssprintf("MegBrain core throws exception: %s\n%s", - typeid_name(exc).c_str(), exc.what()); - - auto mgbexc = dynamic_cast(&exc); - OprPyTracker::TrackerResult tracker; - if (mgbexc) { - tracker = OprPyTracker::get_tracker(*mgbexc); - } - - PYTHON_GIL; - PyObject *py_exc_arg = tracker.as_tuple(msg.c_str()); - PyErr_SetObject(py_exc_class, py_exc_arg); - Py_DECREF(py_exc_arg); - } catch (std::exception &newexc) { - auto newmsg = ssprintf( - "caught exception during handling exception: %s\n%s\n" - "original message: %s", - typeid_name(newexc).c_str(), newexc.what(), - msg.c_str()); - PyErr_SetString(PyExc_RuntimeError, newmsg.c_str()); - } catch (...) 
{ - auto newmsg = ssprintf( - "caught unknown exception during handling exception\n" - "original message: %s", msg.c_str()); - PyErr_SetString(PyExc_RuntimeError, newmsg.c_str()); - } -} - -/* ============== PyExceptionForward ============== */ - -PyExceptionForward::~PyExceptionForward() { - PYTHON_GIL; - PyObjRefKeeper::deleter(m_type); - PyObjRefKeeper::deleter(m_value); - PyObjRefKeeper::deleter(m_traceback); -} - -void PyExceptionForward::restore() { - PyErr_Restore(m_type, m_value, m_traceback); - m_type = m_value = m_traceback = nullptr; -} - -void PyExceptionForward::throw_() { - PyObject *etype, *obj, *trace; - PyErr_Fetch(&etype, &obj, &trace); - PyErr_NormalizeException(&etype, &obj, &trace); - - std::string msg{"python exception"}; - bool succ = false; - if (etype && obj && trace) { - auto run = [&]() { -#define DEF(name, expr) \ - PyObjRefKeeper name{expr}; \ - if (!name.get()) \ - return - DEF(mod, PyImport_ImportModule("traceback")); - DEF(result, PyObject_CallMethod(mod.get(), "format_exception", - "(OOO)", etype, obj, trace)); - if (!PyList_Check(result.get())) - return; - auto size = PyList_Size(result.get()); - msg.append(":\n"); - for (Py_ssize_t i = 0; i < size; ++i) { - msg.append(" "); - msg.append(PyUnicode_AsUTF8(PyList_GetItem(result.get(), i))); - } - msg.pop_back(); // remove last \n - succ = true; -#undef DEF - }; - run(); - } - if (!succ) { - PyObject* obj_str_py; - if (obj && (obj_str_py = PyObject_Repr(obj))) { - msg.append(" with message "); - msg.append(PyUnicode_AsUTF8(obj_str_py)); - Py_DECREF(obj_str_py); - } else { - msg.append(" with unknown message"); - } - } - // throwing exception may cause abort due to unknown reasons; so we first - // log the message - mgb_log_error("caught exception from python callback: %s", msg.c_str()); - fflush(stdout); - fflush(stderr); - throw PyExceptionForward{etype, obj, trace, msg}; -} - -/* ============== namespace npy ============== */ - -namespace { - -int to_mgb_supported_dtype_raw(int dtype) { - if (dtype == NPY_INT64) - return NPY_INT32; - if (dtype == NPY_FLOAT64) - return NPY_FLOAT32; - return dtype; -} - -#define FOREACH_NPY_DTYPE_PAIR(cb) \ - cb(Uint8, NPY_UINT8) \ - cb(Int8, NPY_INT8) \ - cb(Int16, NPY_INT16) \ - cb(Int32, NPY_INT32) \ - cb(Float16, NPY_FLOAT16) \ - cb(Float32, NPY_FLOAT32) - -#define FOREACH_NPY_MGB_DTYPE_PAIR(cb) \ - FOREACH_NPY_DTYPE_PAIR(cb) \ - FOREACH_MGB_DTYPE_PAIR(cb) - - - -//! convert megbrain dtype to numpy dtype -int dtype_mgb2np_raw(DType dtype) { - mgb_assert(dtype.valid(), "attempt to convert from invalid dtype"); - switch (dtype.enumv()) { -#define cb(_m, _n) \ - case DTypeEnum::_m: \ - return _n; - FOREACH_NPY_MGB_DTYPE_PAIR(cb) -#undef cb - default: - break; - } - throw ConversionError(ssprintf( - "can not convert dtype %s to numpy dtype", dtype.name())); -} - -struct PyArrayDescrDeleter { - void operator()(PyArray_Descr* obj) { - Py_XDECREF(obj); - } -}; - -//! Convert MegBrain DType to NumPy DType descriptor, the caller receives a new -//! reference to the descriptor. 
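
The descriptor conversion below encodes parameterized (quantized) dtypes as a plain NumPy integer dtype plus an `mgb_dtype` metadata dict carrying `name`, `scale`, and optionally `zero_point`. Recent NumPy versions expose the same mechanism from Python, so the convention can be mimicked directly (a sketch, assuming NumPy's `metadata` keyword argument is available):

```python
import numpy as np

# QuantizedS8 travels as int8 plus an "mgb_dtype" metadata dict
qint8 = np.dtype(np.int8, metadata={
    "mgb_dtype": {"name": "QuantizedS8", "scale": 0.05}})

meta = qint8.metadata["mgb_dtype"]      # parameters round-trip intact
assert meta["name"] == "QuantizedS8" and meta["scale"] == 0.05

q = np.array([-20, 0, 40], dtype=qint8)
print(q.astype(np.float32) * meta["scale"])   # dequantized: [-1.  0.  2.]
```
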
-std::unique_ptr dtype_mgb2np_descr( - DType dtype) { - PYTHON_GIL; - mgb_assert(dtype.valid(), "attempt to convert from invalid dtype"); - auto build_mgb_dtype_dict = - [](const char* name, - const std::vector>& data) { - PyObject* metadata = PyDict_New(); - PyObject* mgb_dtype_metadata = PyDict_New(); - PyDict_SetItemString(mgb_dtype_metadata, "name", - PyUnicode_FromString(name)); - for (const auto& d : data) { - PyDict_SetItemString(mgb_dtype_metadata, d.first, d.second); - } - PyDict_SetItemString(metadata, "mgb_dtype", mgb_dtype_metadata); - return metadata; - }; - if (dtype.has_param()) { - PyArray_Descr* type_descr; - switch (dtype.enumv()) { - case DTypeEnum::Quantized8Asymm: { - auto& param = dtype.param(); - type_descr = PyArray_DescrNewFromType(NPY_UINT8); - type_descr->metadata = build_mgb_dtype_dict( - DTypeTrait::name, - {{"scale", PyFloat_FromDouble(param.scale)}, - {"zero_point", PyLong_FromLong(param.zero_point)}}); - break; - } - case DTypeEnum::QuantizedS8: { - auto& param = dtype.param(); - type_descr = PyArray_DescrNewFromType(NPY_INT8); - type_descr->metadata = build_mgb_dtype_dict( - DTypeTrait::name, - {{"scale", PyFloat_FromDouble(param.scale)}}); - break; - } - case DTypeEnum::Quantized4Asymm: { - auto& param = dtype.param(); - type_descr = PyArray_DescrNewFromType(NPY_UINT8); - type_descr->metadata = build_mgb_dtype_dict( - DTypeTrait::name, - {{"scale", PyFloat_FromDouble(param.scale)}, - {"zero_point", PyLong_FromLong(param.zero_point)}}); - break; - } - case DTypeEnum::QuantizedS4: { - auto& param = dtype.param(); - type_descr = PyArray_DescrNewFromType(NPY_INT8); - type_descr->metadata = build_mgb_dtype_dict( - DTypeTrait::name, - {{"scale", PyFloat_FromDouble(param.scale)}}); - break; - } - case DTypeEnum::QuantizedS32: { - auto& param = dtype.param(); - type_descr = PyArray_DescrNewFromType(NPY_INT32); - type_descr->metadata = build_mgb_dtype_dict( - DTypeTrait::name, - {{"scale", PyFloat_FromDouble(param.scale)}}); - break; - } - default: - mgb_throw(ConversionError, "unhandled parameterized DType %s", - dtype.name()); - } - return std::unique_ptr(type_descr); - } - PyArray_Descr* basic_descr = PyArray_DescrFromType(dtype_mgb2np_raw(dtype)); - mgb_assert(basic_descr != nullptr, - "failed to convert expected dtype to numpy type descriptor"); - return std::unique_ptr(basic_descr); -} - -DType dtype_np2mgb_raw(int npt) { - switch (npt) { -#define cb(_m, _n) \ - case _n: \ - return dtype::_m(); - FOREACH_NPY_DTYPE_PAIR(cb) -#undef cb - } -#define cb(_m, _n) \ - if (_n == npt) return dtype::_m(); - FOREACH_MGB_DTYPE_PAIR(cb) -#undef cb - - PYTHON_GIL; - std::string msg; - auto py_obj = PyArray_TypeObjectFromType(npt); - if (!py_obj) { - msg = ssprintf("unknown numpy dtype enum %d", npt); - } else { - msg = ssprintf("unsupported numpy dtype %s", - repr_pyobj(py_obj).c_str()); - } - Py_DECREF(py_obj); - throw ConversionError(msg); -} - -DType dtype_np2mgb_descr(PyArray_Descr* descr) { - PYTHON_GIL; - auto handle_parameterized_dtype = [](PyObject* metadata) -> DType { - mgb_assert(PyDict_Check(metadata), - "Invalid parameterized DType metadata: should be a dict"); - PyObject* dtype_name_py = PyDict_GetItemString(metadata, "name"); - mgb_assert( - PyUnicode_Check(dtype_name_py), - "Invalid parameterized DType metadata: name should be a str"); - std::string dtype_name(PyUnicode_AsUTF8(dtype_name_py)); - if (dtype_name == "Quantized8Asymm") { - PyObject* scale_py = PyDict_GetItemString(metadata, "scale"); - PyObject* zero_point_py = - PyDict_GetItemString(metadata, 
"zero_point"); - mgb_assert(scale_py && zero_point_py, - "Invalid Quantized8Asymm metadata: missing scale or " - "zero_point."); - mgb_assert( - PyFloat_Check(scale_py), - "Invalid Quantized8Asymm metadata: scale should be float"); - mgb_assert(PyLong_Check(zero_point_py), - "Invalid Quantized8Asymm metadata: zero_point should be " - "integer"); - auto zero_point = PyLong_AS_LONG(zero_point_py); - mgb_assert(zero_point >= 0 && zero_point < 256, - "Invalid Quantized8Asymm metadata: zero_point should be " - "in [0, 256)"); - return dtype::Quantized8Asymm( - static_cast(PyFloat_AS_DOUBLE(scale_py)), - static_cast(zero_point)); - } - if (dtype_name == "Quantized4Asymm") { - PyObject* scale_py = PyDict_GetItemString(metadata, "scale"); - PyObject* zero_point_py = - PyDict_GetItemString(metadata, "zero_point"); - mgb_assert(scale_py && zero_point_py, - "Invalid Quantized4Asymm metadata: missing scale or " - "zero_point."); - mgb_assert( - PyFloat_Check(scale_py), - "Invalid Quantized4Asymm metadata: scale should be float"); - mgb_assert(PyLong_Check(zero_point_py), - "Invalid Quantized4Asymm metadata: zero_point should be " - "integer"); - auto zero_point = PyLong_AS_LONG(zero_point_py); - mgb_assert(zero_point >= 0 && zero_point < 15, - "Invalid Quantized4Asymm metadata: zero_point should be " - "in [0, 15)"); - return dtype::Quantized4Asymm( - static_cast(PyFloat_AS_DOUBLE(scale_py)), - static_cast(zero_point)); - } - if (dtype_name == "QuantizedS32" || dtype_name == "QuantizedS8" || - dtype_name == "QuantizedS4") { - PyObject* scale_py = PyDict_GetItemString(metadata, "scale"); - mgb_assert(scale_py, "Invalid metadata: missing scale"); - mgb_assert(PyFloat_Check(scale_py), - "Invalid metadata: scale should be float"); - float scale = static_cast(PyFloat_AS_DOUBLE(scale_py)); - if (dtype_name == "QuantizedS32") { - return dtype::QuantizedS32(scale); - } else if (dtype_name == "QuantizedS8"){ - return dtype::QuantizedS8(scale); - } else { - return dtype::QuantizedS4(scale); - } - } - throw ConversionError( - ssprintf("Unknown parameterized DType: %s", dtype_name.c_str()) - .c_str()); - }; - PyObject* dtype_metadata; - if (descr->metadata && PyDict_Check(descr->metadata) && - (dtype_metadata = PyDict_GetItemString(descr->metadata, "mgb_dtype"))) { - return handle_parameterized_dtype(dtype_metadata); - } - return dtype_np2mgb_raw(descr->type_num); -} - -HostTensorND lowbit_ndarray_to_host_tensor( - CompNode comp_node, TensorLayout &layout, PyArrayObject *input) { - auto src_ptr = reinterpret_cast(PyArray_DATA(input)); - if (!layout.ndim) { - // numpy scalar - mgb_assert(src_ptr, "can not convert from null numpy array"); - layout.init_contiguous_stride({1}); - } else { - mgb_assert(layout.ndim && layout.ndim <= TensorShape::MAX_NDIM, - "unsupported ndim %zu", layout.ndim); - for (size_t i = 0; i < layout.ndim; ++ i) { - layout.shape[i] = PyArray_SHAPE(input)[i]; - layout.stride[i] = PyArray_STRIDE(input, i); - mgb_assert(layout.shape[i], "zero shape not supported"); - } - mgb_assert(layout.is_contiguous()); - } - HostTensorND ret{comp_node, layout}; - lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr, - layout.total_nr_elems()); - return ret; -} - -/*! 
- * \brief convert a python object to tensor and try to borrow memory if the - * original object is a contiguous numpy array - * \param dtype see np2tensor - * \return the megbrain tensor, and whether memory is borrowed - */ -std::pair np2tensor_try_borrow( - PyObject *obj, CompNode dest_cn, DType dtype) { - mgb_assert(dest_cn.valid()); - - PYTHON_GIL; - - PyArray_Descr* expected_descr = nullptr; - if (dtype.valid()) { - // The reference to expected_descr will be stealed later. - expected_descr = dtype_mgb2np_descr(dtype).release(); - } - - // make result from PyArrayObject; its reference would be stolen - auto make_from_arr = [&](PyArrayObject *input, bool is_borrow) { - PyObjRefKeeper ref_obj_cvt{reinterpret_cast(input)}; - - TensorLayout layout; - layout.dtype = dtype_np2mgb_descr(PyArray_DESCR(input)); - if (dtype.valid()) - mgb_assert(dtype == layout.dtype); - layout.ndim = PyArray_NDIM(input); - - if (layout.dtype.is_low_bit()) { - auto ret = lowbit_ndarray_to_host_tensor(dest_cn, layout, input); - // decref(input) would be handled by ref_obj_cvt - return std::make_pair(ret, false); - } - - auto data = reinterpret_cast(PyArray_DATA(input)); - if (!layout.ndim) { - // numpy scalar - mgb_assert(data, "can not convert from null numpy array"); - layout.init_contiguous_stride({1}); - } else { - mgb_assert(layout.ndim && layout.ndim <= TensorShape::MAX_NDIM, - "unsupported ndim %zu", layout.ndim); - auto dsize = layout.dtype.size(); - bool is_empty = false; - for (size_t i = 0; i < layout.ndim; ++ i) { - layout.shape[i] = PyArray_SHAPE(input)[i]; - layout.stride[i] = PyArray_STRIDE(input, i); - if (!layout.shape[i]) { - is_empty = true; - } - mgb_assert(layout.stride[i] % dsize == 0, - "bad stride %zd", layout.stride[i]); - layout.stride[i] /= dsize; - } - mgb_assert(is_empty || layout.is_contiguous()); - } - HostTensorStorage storage; - auto input_ptr = ref_obj_cvt.make_shared(data); - storage.reset(dest_cn, layout.span().high_byte, input_ptr); - HostTensorND ret; - ret.reset(storage, layout); - return std::make_pair(ret, is_borrow); - }; - - PyArrayObject *obj_as_arr = nullptr; - do { - // check contiguous and dtype, and borrow mem if ok - if (!PyArray_Check(obj)) - break; - obj_as_arr = reinterpret_cast(obj); - int typenum = PyArray_DTYPE(obj_as_arr)->type_num; - // We have to check dtype.valid() and typenum first to avoid - // accidentally trigger ConversionError on incompatible dtypes which can - // be automatically converted into comptaible ones (e.g. float64). 
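
The borrow-vs-copy decision implemented above reduces to two checks: the dtype must already be what the graph expects, and the buffer must be a C-contiguous, readable array; everything else falls through to a converting copy. In NumPy terms (illustrative only):

```python
import numpy as np

def can_borrow(arr, expected_dtype=None):
    # zero-copy only when the dtype already matches and memory is C-contiguous
    if expected_dtype is not None and arr.dtype != np.dtype(expected_dtype):
        return False
    return arr.flags["C_CONTIGUOUS"]

a = np.arange(6, dtype=np.float32).reshape(2, 3)
assert can_borrow(a)                          # contiguous, dtype matches
assert not can_borrow(a.T)                    # transposed view needs a copy
assert not can_borrow(a, np.float64)          # dtype mismatch needs a cast
assert can_borrow(np.ascontiguousarray(a.T))  # what the fallback path produces
```
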
- if (dtype.valid() && - (expected_descr->type_num != typenum || - dtype_np2mgb_descr(PyArray_DTYPE(obj_as_arr)) != dtype)) - break; - if (typenum != to_mgb_supported_dtype_raw(typenum)) { - mgb_assert(!dtype.valid() && expected_descr == nullptr); - expected_descr = - PyArray_DescrFromType(to_mgb_supported_dtype_raw(typenum)); - break; - } - if (PyArray_ISCARRAY_RO(obj_as_arr)) { - Py_INCREF(obj_as_arr); - return make_from_arr(obj_as_arr, true); - } - } while(0); - - constexpr auto NP_FLAGS = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_FORCECAST; - PyObject *obj_cvt; - if (obj_as_arr) { - obj_cvt = PyArray_FromArray(obj_as_arr, expected_descr, NP_FLAGS); - } else { - obj_cvt = PyArray_FromAny(obj, expected_descr, 0, 0, NP_FLAGS, nullptr); - } - - if (obj_cvt) { - // convert to mgb supported dtype - auto arr = reinterpret_cast(obj_cvt); - int dt0 = PyArray_TYPE(arr), dt1 = to_mgb_supported_dtype_raw(dt0); - if (dt0 != dt1) { - mgb_assert(expected_descr == nullptr); - expected_descr = PyArray_DescrFromType(dt1); - mgb_assert(expected_descr); - auto obj_cvt_new = PyArray_FromAny( - obj_cvt, expected_descr, 0, 0, NP_FLAGS, nullptr); - Py_DECREF(obj_cvt); - obj_cvt = obj_cvt_new; - } - } - - if (!obj_cvt) { - if (PyErr_Occurred()) { - PyExceptionForward::throw_(); - } - throw ConversionError(ssprintf("can not convert to numpy array from %s", - repr_pyobj(obj).c_str())); - } - - return make_from_arr(reinterpret_cast(obj_cvt), false); -} - -//! hold a reference to HostTensorND -class HostTensorNDRefHolder final: public NonCopyableObj { - HostTensorND m_val; - static MemPool sm_mem_pool; - - friend class MemPool; - - HostTensorNDRefHolder(const HostTensorND &v): - m_val{v} - { - } - - public: - - static HostTensorNDRefHolder* alloc(const HostTensorND &v) { - return sm_mem_pool.alloc(v); - } - - static void free(HostTensorNDRefHolder *p) { - return sm_mem_pool.free(p); - } -}; -MemPool HostTensorNDRefHolder::sm_mem_pool; - -void ndarray_shared_from_tensor_py_capsule_dtor(PyObject *cap) { - auto ptr = PyCapsule_GetPointer(cap, "HostTensorND"); - mgb_assert(ptr, "not a PyCapsule: %s", repr_pyobj(cap).c_str()); - HostTensorNDRefHolder::free(static_cast(ptr)); -} - -} // anonymous namespace - -PyObject* npy::ndarray_from_tensor( - const HostTensorND &val, ShareType share_type) { - if (!val.layout().is_contiguous() && !val.shape().is_empty()) { - mgb_assert(share_type != ShareType::MUST_SHARE); - HostTensorND contig; - contig.copy_from(val); - return ndarray_from_tensor(contig, ShareType::TRY_SHARE); - } - PYTHON_GIL; - npy_intp dims[TensorLayout::MAX_NDIM]; - for (size_t i = 0; i < val.layout().ndim; ++ i) - dims[i] = val.shape()[i]; - PyObject* ret = nullptr; - - auto alloc_new_ret = [&]() { - mgb_assert(!ret); - ret = PyArray_NewFromDescr( - &PyArray_Type, dtype_mgb2np_descr(val.dtype()).release(), - val.layout().ndim, dims, nullptr, nullptr, 0, nullptr); - mgb_assert(ret, "failed to allocate array"); - mgb_assert(PyArray_Check(ret)); - return PyArray_DATA(reinterpret_cast(ret)); - }; - if (val.dtype().is_low_bit()) { - mgb_assert(share_type != ShareType::MUST_SHARE, - "can not share memory for lowbit dtype"); - lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(), val.raw_ptr(), - val.layout().total_nr_elems()); - } else if (share_type == ShareType::MUST_UNSHARE) { - memcpy(alloc_new_ret(), val.raw_ptr(), val.layout().span().dist_byte()); - } else { - // share data - ret = PyArray_NewFromDescr( - &PyArray_Type, dtype_mgb2np_descr(val.dtype()).release(), - val.layout().ndim, dims, nullptr, - 
const_cast(val.raw_ptr()), 0, nullptr); - mgb_assert(ret, "failed to alloc ndarray"); - auto capsule = PyCapsule_New(HostTensorNDRefHolder::alloc(val), - "HostTensorND", ndarray_shared_from_tensor_py_capsule_dtor); - mgb_assert(capsule, "failed to create PyCapsule"); - auto err = PyArray_SetBaseObject( - reinterpret_cast(ret), capsule); - mgb_assert(!err); - } - return ret; -} - -HostTensorND npy::np2tensor(PyObject* obj, const Meth& meth, DType dtype) { - auto ret_full = np2tensor_try_borrow(obj, meth.dest_cn_, dtype); - if (meth.dest_tensor_) { - meth.dest_tensor_->copy_from(ret_full.first); - return *meth.dest_tensor_; - } - if (meth.must_borrow_) { - mgb_assert(ret_full.second, - "can not borrow from numpy array as contig array with dtype " - "%s; src=%s", - dtype.name(), repr_pyobj(obj).c_str()); - } - return ret_full.first; -} - -PyObject* npy::dtype_mgb2np(mgb::DType dtype) { - PYTHON_GIL; - // According to - // https://docs.scipy.org/doc/numpy/reference/c-api.array.html#c.PyArray_TypeObjectFromType - // the following is equivalent to PyArray_TypeObjectFromType for built-in - // types. - auto descr = dtype_mgb2np_descr(dtype); - if (descr == nullptr) { - return nullptr; - } - if (dtype.has_param()) { - return reinterpret_cast(descr.release()); - } - PyObject* typeobj = reinterpret_cast(descr->typeobj); - Py_XINCREF(typeobj); - return typeobj; -} - -mgb::DType npy::dtype_np2mgb(PyObject *obj) { - mgb_assert(obj && obj != Py_None, - "can not convert null PyObject to numpy dtype"); - // see - // http://stackoverflow.com/questions/8477122/numpy-c-api-convert-type-object-to-type-number - PYTHON_GIL; - - PyArray_Descr* dtype; - if(!PyArray_DescrConverter(obj, &dtype)) { - throw ConversionError(ssprintf("can not convert to np.dtype from %s", - repr_pyobj(obj).c_str())); - } - - mgb::DType result = dtype_np2mgb_descr(dtype); - Py_DECREF(dtype); - return result; -} - -PyObject* npy::to_mgb_supported_dtype(PyObject* dtype) { - PYTHON_GIL; - - PyArray_Descr* descr; - if (!PyArray_DescrConverter(dtype, &descr)) { - throw ConversionError(ssprintf("can not convert to np.dtype from %s", - repr_pyobj(dtype).c_str())); - } - mgb_assert(!descr->metadata, - "unexpected metadata in dtype: " - "dtype_obj=%s metadata=%s", - repr_pyobj(dtype).c_str(), repr_pyobj(descr->metadata).c_str()); - int type_num = to_mgb_supported_dtype_raw(descr->type_num); - return PyArray_TypeObjectFromType(type_num); -} - -TensorShape npy::vec2shape(const std::vector &vec) { - TensorShape shape; - mgb_assert(vec.size() <= TensorShape::MAX_NDIM, - "dim too large: %zd (max %zd)", - vec.size(), TensorShape::MAX_NDIM); - shape.ndim = vec.size(); - for (size_t i = 0; i < vec.size(); i ++) { - if (!vec[i]) { - shape.ndim = 0; - break; - } - shape[i] = vec[i]; - } - mgb_assert(shape.ndim, "shape should not be empty"); - return shape; -} - -void mgb_init_numpy() { - import_array1( ); -} - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/cpp/python_helper.h b/python_module/src/cpp/python_helper.h deleted file mode 100644 index 1d443905..00000000 --- a/python_module/src/cpp/python_helper.h +++ /dev/null @@ -1,229 +0,0 @@ -/** - * \file python_module/src/cpp/python_helper.h - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * \brief helper utilities for python integration - * - * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
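
The sharing branch above is the C-API spelling of a familiar NumPy idiom: a zero-copy view stays valid because its base object holds a reference to the memory owner, which is exactly what the `HostTensorND` capsule provides. The pure-Python analogue:

```python
import numpy as np

owner = bytearray(b"\x01\x02\x03\x04")       # stand-in for HostTensorND storage
view = np.frombuffer(owner, dtype=np.uint8)  # shares memory, no copy
assert view.base is not None                 # the view keeps its owner alive

owner[0] = 9                                 # shared memory: mutation is visible
assert view[0] == 9
```
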
- * - */ - -#pragma once - -#include "megbrain/graph.h" - -#include -#include - -class GILManager { - PyGILState_STATE gstate; - - public: - GILManager(): - gstate(PyGILState_Ensure()) - { - } - - ~GILManager() { - PyGILState_Release(gstate); - } -}; -#define PYTHON_GIL GILManager __gil_manager - -//! wraps a shared_ptr and decr PyObject ref when destructed -class PyObjRefKeeper { - std::shared_ptr m_ptr; - -public: - static void deleter(PyObject* p) { - if (p) { - PYTHON_GIL; - Py_DECREF(p); - } - } - - PyObjRefKeeper() = default; - PyObjRefKeeper(PyObject* p) : m_ptr{p, deleter} {} - - PyObject* get() const { return m_ptr.get(); } - - //! create a shared_ptr as an alias of the underlying ptr - template - std::shared_ptr make_shared(T* ptr) const { - return {m_ptr, ptr}; - } -}; - -class PyStackExtracter { - static PyStackExtracter *ins; - - public: - virtual ~PyStackExtracter() = default; - - virtual std::string extract() = 0; - - static void reg(PyStackExtracter *p) { - ins = p; - } - - static std::string run() { - return ins->extract(); - } -}; - -//! exception to be thrown when python callback fails -class PyExceptionForward : public std::exception { - PyObject *m_type, *m_value, *m_traceback; - std::string m_msg; - - PyExceptionForward(PyObject* type, PyObject* value, PyObject* traceback, - const std::string& msg) - : m_type{type}, - m_value{value}, - m_traceback{traceback}, - m_msg{msg} {} - -public: - PyExceptionForward(const PyExceptionForward&) = delete; - PyExceptionForward& operator=(const PyExceptionForward&) = delete; - ~PyExceptionForward(); - - PyExceptionForward(PyExceptionForward&& rhs) - : m_type{rhs.m_type}, - m_value{rhs.m_value}, - m_traceback{rhs.m_traceback}, - m_msg{std::move(rhs.m_msg)} { - rhs.m_type = rhs.m_value = rhs.m_traceback = nullptr; - } - - //! throw PyExceptionForward from current python error state - static void throw_() __attribute__((noreturn)); - - //! restore python error - void restore(); - - const char* what() const noexcept override { return m_msg.c_str(); } -}; - -/*! - * \brief make python exception - */ -class PyMGBExceptionMaker { - static PyObject *py_exc_class; - friend std::string blame(mgb::cg::OperatorNodeBase* opr); - - public: - static void setup_py_exception(std::exception &exc); - - static void _reg_exception_class(PyObject *cls) { - py_exc_class = cls; - } - -}; - -//! associate a python object with an operator -class OprPyTracker final : public mgb::NonCopyableObj { - class TrackerStorage; - OprPyTracker() = delete; - -public: - /*! - * \brief set current tracker; all operators created later would share - * this tracker - * - * Note that a py reference would be kept - */ - static void begin_set_tracker(mgb::cg::ComputingGraph& graph, - PyObject* obj); - - static void end_set_tracker(mgb::cg::ComputingGraph& graph); - - struct TrackerResult { - mgb::cg::OperatorNodeBase - //! operator that directly causes the exception - *exc_opr = nullptr, - //! operator constructed by user (non-optimized exc_opr) - *unopt_opr = nullptr, - //! the grad source if opr is constructed by taking grad - *opr_grad_src = nullptr; - PyObject *tracker = nullptr, *tracker_grad_src = nullptr; - - //! format as python tuple - PyObject* as_tuple(const char* leading_msg = nullptr) const; - }; - - //! get tracker from exception - static TrackerResult get_tracker(mgb::MegBrainError& exc); - - //! get tracker from operator - static TrackerResult get_tracker(mgb::cg::OperatorNodeBase* opr); -}; - -std::string blame(mgb::cg::OperatorNodeBase* opr); - -//! 
numpy utils -namespace npy { - //! convert tensor shape to raw vector - static inline std::vector shape2vec(const mgb::TensorShape &shape) { - return {shape.shape, shape.shape + shape.ndim}; - } - - //! change numpy dtype to megbrain supported dtype - PyObject* to_mgb_supported_dtype(PyObject *dtype); - - //! convert raw vector to tensor shape - mgb::TensorShape vec2shape(const std::vector &vec); - - //! convert megbrain dtype to numpy dtype object; return new reference - PyObject* dtype_mgb2np(mgb::DType dtype); - - //! convert numpy dtype object or string to megbrain dtype - mgb::DType dtype_np2mgb(PyObject *obj); - - //! buffer sharing type - enum class ShareType { - MUST_SHARE, //!< must be shared - MUST_UNSHARE, //!< must not be shared - TRY_SHARE //!< share if possible - }; - - //! get ndarray from HostTensorND - PyObject* ndarray_from_tensor(const mgb::HostTensorND &val, - ShareType share_type); - - //! specify how to convert numpy array to tensor - struct Meth { - bool must_borrow_ = false; - mgb::HostTensorND *dest_tensor_ = nullptr; - mgb::CompNode dest_cn_; - - //! make a Meth that allows borrowing numpy array memory - static Meth borrow( - mgb::CompNode dest_cn = mgb::CompNode::default_cpu()) { - return {false, nullptr, dest_cn}; - } - - //! make a Meth that requires the numpy array to be borrowed - static Meth must_borrow( - mgb::CompNode dest_cn = mgb::CompNode::default_cpu()) { - return {true, nullptr, dest_cn}; - } - - //! make a Meth that requires copying the value into another - //! tensor - static Meth copy_into(mgb::HostTensorND *tensor) { - return {false, tensor, tensor->comp_node()}; - } - }; - /*! - * \brief convert an object to megbrain tensor - * \param meth specifies how the conversion should take place - * \param dtype desired dtype; it can be set as invalid to allow arbitrary - * dtype - */ - mgb::HostTensorND np2tensor(PyObject *obj, const Meth &meth, - mgb::DType dtype); -} - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/python/genopr.py b/python_module/src/python/genopr.py deleted file mode 100755 index 28ce7e28..00000000 --- a/python_module/src/python/genopr.py +++ /dev/null @@ -1,293 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# This file is part of MegBrain. -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
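
The `to_mgb_supported_dtype` helper declared above narrows NumPy types that MegBrain does not execute natively: 64-bit integers and floats map to their 32-bit counterparts, everything else passes through. The rule in NumPy terms (a sketch, not the deleted implementation):

```python
import numpy as np

_NARROW = {np.dtype(np.int64): np.dtype(np.int32),
           np.dtype(np.float64): np.dtype(np.float32)}

def to_supported(dt):
    dt = np.dtype(dt)
    return _NARROW.get(dt, dt)   # 64-bit types narrow to 32-bit; others unchanged

assert to_supported(np.int64) == np.int32
assert to_supported("float64") == np.float32
assert to_supported(np.uint8) == np.uint8    # already supported, unchanged
```
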
- -from io import StringIO -import re -import argparse -import subprocess -import os -import textwrap - -def camel2underscore( - name, *, - first_cap_re=re.compile('([A-Z])([A-Z][a-z]+)'), - all_cap_re = re.compile('([a-z])([A-Z]+)')): - if name.isupper(): - return name.lower() - s1 = first_cap_re.sub(r'\1_\2', name) - return all_cap_re.sub(r'\1_\2', s1).lower() - -class Doc: - """wrap an identifier and doc""" - _id = None - - def __init__(self, id_, doc, typestr=None, default=None): - self._id = id_ - self.doc = doc - self.typestr = typestr - self.default = default - - def __str__(self): - return self._id - - -class OprGenerator: - _fout = None - _cur_indent = '' - - def __init__(self): - self._fout = StringIO() - - def _indent(self): - self._cur_indent += ' ' * 4 - - def _unindent(self): - self._cur_indent = self._cur_indent[:-4] - - def _write(self, content, *fmt, indent=0): - if indent < 0: - self._unindent() - - self._fout.write(self._cur_indent) - if fmt: - content = content % fmt - self._fout.write(content) - self._fout.write('\n') - - if indent > 0: - self._indent() - - def _gen_signature(self, inputs, params, *, have_config=True, - has_out_dtype=False): - assert inputs - sig = [] - for inp in inputs: - name = str(inp) - if name.startswith('*'): - assert name[1:].isidentifier() - assert inp is inputs[-1] - else: - assert name.isidentifier() - if isinstance(inp, Doc) and inp.default is not None: - name += '={}'.format(inp.default) - sig.append(name) - if not str(inputs[-1]).startswith('*'): - sig.append('*') - for i, _ in params: - sig.append('{}=None'.format(i)) - - if have_config: - sig.extend(['name=None', 'comp_node=None', 'config=None']) - if 'comp_graph' not in map(str, inputs): - sig.append('comp_graph=None') - if has_out_dtype: - assert 'dtype' not in map(str, inputs) - sig.append('dtype=None') - - if params: - sig.append('**kwargs') - - if sig[-1] == '*': - sig.pop() - return ', '.join(sig) - - def _write_canonize_inputs(self, inputs, canonize_input_vars, - canonize_input_vars_args=None, - has_out_dtype=False): - self._write_gen_config(has_out_dtype) - inputs = list(map(str, inputs)) - if canonize_input_vars_args is None: - if inputs[0][0] == '*': - arg = inputs[0][1:] - else: - arg = '[{}]'.format(', '.join(inputs)) - else: - arg = canonize_input_vars_args - self._write('all_inputs = _helper.%s(%s, ' - 'comp_graph=comp_graph, config=config)', - canonize_input_vars, arg) - - def _write_gen_config(self, has_out_dtype=False): - if not has_out_dtype: - self._write('config = _helper.gen_config(name, comp_node, config)') - else: - self._write('config = _helper.gen_config(name, comp_node, config, dtype)') - - def _write_make_params(self, params, body): - for pname, ptype in params: - self._write('%s = _helper.cvt_to_opr_param_def(%s, ' - '_opr_param_defs.%s, kwargs)', - pname, pname, ptype) - self._write('assert not kwargs, "extra kwargs: {}".format(kwargs)') - - for i in body: - self._write(i) - - self._write('all_params = []') - for pname, _ in params: - self._write('all_params.append(%s.serialize())', - pname) - - def _write_doc(self, inputs, params, desc): - self._write('"""') - if isinstance(desc, Doc): - assert desc._id is None - self._write(desc.doc) - elif desc: - for i in textwrap.wrap(desc, 75): - self._write(i) - - self._write('') - for i in inputs: - name = str(i) - typestr = ':class:`.SymbolVar`' - if name[0] == '*': - name = name[1:] - typestr = 'list of ' + typestr - if isinstance(i, Doc): - self._write(':param %s: %s', name, i.doc) - if i.typestr is not None: - 
typestr = i.typestr - if typestr: - if not isinstance(i, Doc): - self._write(':param %s: ', name) - self._write(':type %s: %s', name, typestr) - - for pname, ptype in params: - self._write(':param %s: ', pname) - self._write(':type %s: :class:`~megbrain.opr_param_defs.%s`', - pname, ptype) - - self._write(':param comp_node: see doc for *config*') - self._write(':param name: see doc for *config*') - self._write( - ':param config: give a :class:`.OperatorNodeConfig` object to set ' - 'operator name and comp node. This can also be achieved by passing ' - '*comp_node* and *name* separately.') - - if 'comp_graph' not in map(str, inputs): - self._write( - ':param comp_graph: If all inputs are immediate numbers, ' - '*comp_graph* and *comp_node* must be provided ' - 'so the input values can be put on appropriate ' - 'computing graph and computing node') - self._write('"""') - - def _write_return(self, name, outputs): - self._write('outputs = _mgb._create_opr(' - '"%s", all_inputs, all_params, config)', name) - if outputs: - self._write('outputs = [outputs[i] for i in %s]', - list(map(int, outputs))) - self._write('return _helper.cvt_opr_result(outputs, ' - '**cvt_result_kwargs)') - - def decl_opr(self, name, *, inputs, params, desc=None, pyname=None, - canonize_input_vars='canonize_input_vars', - canonize_input_vars_args=None, body=[], - outputs=None, version=0, has_out_dtype=False): - """ - :param inputs: name of variable inputs; a name starting with `*' means - a list of vars - :type inputs: list of str - :param params: (param name, param type) pairs; it can be a single - string representing the param type, and param name defaults to - 'param' - :type params: list of pair of str, or str - :param pyname: python function name - :param body: extra statements to be placed before calling _create_opr - :param outputs: the indices of output vars to be selected from raw opr - result - """ - if isinstance(params, str): - params = [('param', params)] - assert params - - if pyname is None: - pyname = camel2underscore(name) - - self._write('def %s(%s):', pyname, - self._gen_signature(inputs, params, - has_out_dtype=has_out_dtype), indent=1) - self._write_doc(inputs, params, desc) - self._write_canonize_inputs( - inputs, canonize_input_vars, - canonize_input_vars_args=canonize_input_vars_args, - has_out_dtype=has_out_dtype) - self._write('cvt_result_kwargs = {}') - self._write_make_params(params, body) - if version: - name += 'V{}'.format(version) - self._write_return(name, outputs) - self._write('', indent=-1) - - def decl_raw_opr(self, name, *, inputs, inputs_cvt=[], body=None, - desc=None, local_defs=[], have_config=True): - """declare a raw operator that is forwarded to _mgb._Opr; if *body* is - given, a custom implemented can be provided - - :param inputs_cvt: list of (input name, cvt) pairs, where cvt is name - of the function to convert that input - :param body: list of statements to produce output, or None - :param local_defs: list of statements to be prepended before generated - code - """ - self._write('def %s(%s):', name, - self._gen_signature(inputs, [], have_config=have_config), - indent=1) - self._write_doc(inputs, [], desc) - if have_config: - self._write_gen_config() - for i in local_defs: - self._write(i) - for k, v in inputs_cvt: - self._write('%s = %s(%s)', k, v, k) - self._write('cvt_result_kwargs = {}') - if body is None: - self._write('output = _mgb._Opr.%s(%s, config)', - name, ', '.join(map(str, inputs))) - else: - for i in body: - self._write(i) - self._write( - 'return 
_helper.cvt_opr_result(output, **cvt_result_kwargs)') - self._write('', indent=-1) - - def get_str(self): - return self._fout.getvalue() - - -def main(): - parser = argparse.ArgumentParser( - description='generate operator function def code from decl file') - parser.add_argument('inputs', nargs='+') - args = parser.parse_args() - - gen = OprGenerator() - exec_globals = { - 'decl_opr': gen.decl_opr, - 'decl_raw_opr': gen.decl_raw_opr, - 'Doc': Doc, - 'camel2underscore': camel2underscore, - } - for i in args.inputs: - print('generate oprs from {}'.format(i)) - with open(i) as fin: - exec(fin.read(), exec_globals) - - git_commit = subprocess.Popen(['git', 'rev-parse', 'HEAD'], - stdout=subprocess.PIPE).communicate()[0].strip() - git_commit = git_commit.decode('utf-8') - - file_rela = lambda *paths: os.path.join(os.path.dirname(__file__), *paths) - outfile = lambda name: file_rela('../../megengine/_internal', name) - with open(file_rela('opr_template.py')) as fin: - with open(outfile('opr.py'), 'w') as fout: - fout.write(fin.read(). - replace('{%body%}', gen.get_str()). - replace('{%git_commit%}', git_commit)) - -if __name__ == '__main__': - main() diff --git a/python_module/src/python/opr_template.py b/python_module/src/python/opr_template.py deleted file mode 100644 index 29ea0a9d..00000000 --- a/python_module/src/python/opr_template.py +++ /dev/null @@ -1,425 +0,0 @@ -# -*- coding: utf-8 -*- -# This file is part of MegBrain. -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - -"""This python module contains functions to apply the operators defined by -megbrain. - -.. note:: - Most of the functions are automatically generated, and their signature have - the form contain a ``param`` argument (or more than one arguments such as - :func:`convolution` that has ``param`` and ``execution_polity``) and also - accept keyword arguments. In such case, it can be called by either - providing a param object of appropriate type, or by passing the arguments - needed by the constructor of param object to the keyword arguments. - Furthermore, for a param that needs an enumeration member, the enum name - can be used to refer to the enum object. - - For example, the following statements are equivalent:: - - elemwise([a, b], mode='max') - elemwise([a, b], mode=opr_param_defs.Elemwise.Mode.MAX) - elemwise([a, b], param=opr_param_defs.Elemwise('max')) -""" - -from . import mgb as _mgb -from . import helper as _helper -from . import opr_param_defs as _opr_param_defs - -import sys -import enum -import collections -import json - -__git_commit__ = "{%git_commit%}" - -{%body%} - -class _ElemMeta(type): - def __getattr__(self, name): - def run(*inputs, **kwargs): - return elemwise(inputs, mode=name, **kwargs) - if name.startswith('__'): - return - return run - - -class elem(metaclass=_ElemMeta): - """ - Helper class for easily applying element-wise operator. Request for getting - member method would be translated to a call to :func:`elemwise` with mode - set to the method name. 
Example:: - - elem.exp(a) # elemwise(a, mode='exp') - elem.max(a, b) # elemwise([a, b], mode='max') - - """ - - -def add_update( - dest, delta, - alpha=_mgb.SharedScalar(1), beta=_mgb.SharedScalar(1), - bias=_mgb.SharedScalar(0), disable=_mgb.SharedScalar(0), *, - name=None, comp_node=None, config=None, comp_graph=None): - """update *dest* by `dest := dest*alpha + delta*beta + bias` - - :param dest: target var to be updated; must be created from - :func:`make_shared` - :type dest: :class:`.SymbolVar` - :param disable: AddUpdate will not be executed if disable is set to 1, - this is used for dynamic param-updating. The value of this SharedScalar - can only be set to 0/1 of type `int` - :type disable: :class:`.SharedScalar` - """ - def as_ss(x): - if not isinstance(x, _mgb.SharedScalar): - x = _mgb.SharedScalar(x) - return x - - assert isinstance(dest, _mgb.SymbolVar) - config = _helper.gen_config(name, comp_node, config) - dest, delta = _helper.canonize_input_vars( - [dest, delta], comp_graph=comp_graph, config=config) - - assert isinstance(disable, _mgb.SharedScalar) - - alpha, beta, bias = map(as_ss, [alpha, beta, bias]) - return _mgb._Opr.add_update(dest, delta, alpha, beta, bias, disable, config) - -def reduce_(src, mode, axis=None, keepdims=False, *, - name=None, comp_node=None, config=None, comp_graph=None): - """reduce along given axis; if axis is None, reduce to scalar - - :param mode: reduction mode - :type mode: :class:`~megengine._internal.opr_param_defs.Reduce.Mode` compatible - :param axis: axis along which to reduce input var - :type axis: int - :param keepdims: whether to keep an axis of shape 1 in the result - :type keepdims: False - """ - assert isinstance(src, _mgb.SymbolVar) - config = _helper.gen_config(name, comp_node, config) - inputs = [src] - kwargs = {'mode': mode} - remove_axis = False - if axis is None: - inputs.append(1) - assert not keepdims, 'can not set axis=None and keepdims=True' - else: - remove_axis = not keepdims - kwargs['axis'] = axis - - ret = reduce_general(inputs, config=config, comp_graph=comp_graph, - **kwargs) - if remove_axis: - ret = _mgb._Opr.remove_axis(ret, axis, _mgb.make_opr_config()) - return _helper.cvt_opr_result(ret) - -def _reduce_like(impl, src, axis, keepdims, - name, comp_node, config, comp_graph): - config = _helper.gen_config(name, comp_node, config) - remove_axis = False - if axis is None: - assert not keepdims, 'can not set axis=None and keepdims=True' - src = src.flatten() - axis = 0 - else: - assert isinstance(axis, int) and axis >= 0, ( - 'bad axis: {!r}'.format(axis)) - remove_axis = not keepdims - - ret = impl(src, axis=axis, config=config, comp_graph=comp_graph) - if remove_axis: - ret = _mgb._Opr.remove_axis(ret, axis, _mgb.make_opr_config()) - return _helper.cvt_opr_result(ret) - -def dimshuffle(src, pattern, ndim=0, *, - name=None, comp_node=None, config=None): - """swap shapes and strides according to given pattern - - :param pattern: a list of integers, where each element is the input axis of - that output axis. 
An element could also be 'x' for creating a new axis - with shape 1 - :param ndim: number of input dimensions; 0 to be inferred from pattern; - this is required only for grad - """ - config = _helper.gen_config(name, comp_node, config) - if not isinstance(pattern, (list, tuple)): - raise TypeError('could not convert {} to dimshuffle pattern'.format( - pattern)) - pattern_mgb = _mgb._VectorInt() - for i in pattern: - if i == 'x': - pattern_mgb.push_back(-1) - else: - i = int(i) - assert i >= 0 - pattern_mgb.push_back(i) - return _mgb._Opr.dimshuffle(src, pattern_mgb, int(ndim), config) - -def param_pack_split(src, shapes, *, - name=None, comp_node=None, config=None): - """ - split param into a list of tensor for given shape - ParamPackSplit operator has a input: ``src`` and would - have a ``output``. output[i] indicates the address of tensor which part of - ``src`` would transfer its elements into. - - Example: a input tensor with size 32, the shapes: ``[(1, 2, 4), (4, 2, 2), - (4, 2, 1)]``, the output tensor would be a list of address with size 3. - output[0] indicates the address of tensor with shapes[0]:(1, 2, 4), - output[1] indicates the address of tensor with shapes[1]:(4, 2, 2), - output[2] indicates the address of tensor with shapes[2]:(4, 2, 1). - - :param src: The concatenated input tensor. - :type src: :class:`SymbolVar` - :param shapes: Shapes of output tensors - :type shapes: list of list of int - """ - config = _helper.gen_config(name, comp_node, config) - - if not isinstance(shapes, (list, tuple)): - raise TypeError('could not convert {} to tensor shapes'.format( - shapes)) - - shapes_mgb = _mgb._VectorTensorShape() - - for s in shapes: - s = tuple(map(int, s)) - assert min(s) > 0 - shapes_mgb.push_back(s) - - return _mgb._Opr.param_pack_split(src, shapes_mgb, config) - -class _modify_subtensor_helper: - def __init__(self, dest, val, *, name=None, comp_node=None, config=None): - self.dest = dest - self.val = val - self.config = _helper.gen_config(name, comp_node, config) - - def __getitem__(self, idx): - inp = _mgb._VectorSymbolVar() - dest, desc = _helper.cvt_getitem_to_idx_desc( - self.dest, idx, allow_newaxis=False) - assert desc is not None, 'no __getitem__ entries given' - inp.push_back(dest) - inp.push_back(self.val) - return _mgb._create_subtensor_like_opr( - self._opr_name, inp, desc, self.config) - -class set_subtensor(_modify_subtensor_helper): - """a proxy object which supports ``__getitem__`` to set subtensor. - ``c = set_subtensor(a, b)[idx]`` is equivalent to the numpy - expression:: - - c = a.copy() - c[idx] = b - - """ - _opr_name = 'set_subtensor' - - -class incr_subtensor(_modify_subtensor_helper): - """a proxy object which supports ``__getitem__`` to increase subtensor. 
- ``c = incr_subtensor(a, b)[idx]`` is equivalent to the numpy - expression:: - - c = a.copy() - c[idx] += b - """ - _opr_name = 'incr_subtensor' - -class mesh_indexing: - """ Extract elements from given tensor by the coordinates which is - Cartesian product of given index; example:: - - mesh_indexing(x)[:, [2, 3], :, [2, 3, 4]] - """ - - def __init__(self, src, *, name=None, comp_node=None, config=None): - self.src = src - self.config = _helper.gen_config(name, comp_node, config) - - - def __getitem__(self, idx): - inp, desc = _helper.cvt_getitem_to_idx_desc(self.src, idx) - if desc is None: - return inp - return _mgb._create_subtensor_like_opr( - 'mesh_indexing', [inp], desc, self.config) - -class batched_mesh_indexing: - """ Similar to :class:`mesh_indexing`, while the k-th position of - slices is a 2-dim matrix `matrix[k]`. - The `matrix[k] is a list of index. The i-th row `matrix[k][i]` - represents the index of the associated k-th position slice when - `batch_idx == i` ; example:: - - batched_mesh_indexing(x)[:, [[1, 2], [2, 3]], 1:-1:-1] - - .. warning:: - The first dimension of slices must be (start, stop, step) like, - cannot be any of SymbolVar, numpy.array, Python list. - And the shape of other indexs must be (n, x) while n is the length - of first dimension of tensor after applying [start:stop:step] - - """ - - def __init__(self, src, *, name=None, comp_node=None, config=None): - self.src = src - self.config = _helper.gen_config(name, comp_node, config) - - - def __getitem__(self, idx): - inp, desc = _helper.cvt_getitem_to_idx_desc(self.src, idx) - if desc is None: - return inp - return _mgb._create_subtensor_like_opr( - 'batched_mesh_indexing', [inp], desc, self.config) - -class incr_mesh_indexing(_modify_subtensor_helper): - _opr_name = 'incr_mesh_indexing' - -class set_mesh_indexing(_modify_subtensor_helper): - _opr_name = 'set_mesh_indexing' - -class batched_incr_mesh_indexing(_modify_subtensor_helper): - _opr_name = 'batched_incr_mesh_indexing' - -class batched_set_mesh_indexing(_modify_subtensor_helper): - _opr_name = 'batched_set_mesh_indexing' - -class advanced_indexing: - """wrapper for numpy-like advanced indexing, where a non-slice index can be - a vector; example:: - - advanced_indexing(x)[:, [2, 3]] - - """ - def __init__(self, src, *, name=None, comp_node=None, config=None): - self.src = src - self.config = _helper.gen_config(name, comp_node, config) - - def __getitem__(self, idx): - inp, desc = _helper.cvt_getitem_to_idx_desc(self.src, idx) - if desc is None: - return inp - return _mgb._create_subtensor_like_opr( - 'mavi', [inp], desc, self.config) - -class set_advanced_indexing(_modify_subtensor_helper): - """:class:`set_subtensor` equivalent with advanced-indexing support""" - _opr_name = 'set_mavi' - - -class incr_advanced_indexing(_modify_subtensor_helper): - """:class:`incr_subtensor` equivalent with advanced-indexing support""" - _opr_name = 'incr_mavi' - - -def mean(inp, axis, keepdims): - """average value along an axis""" - if hasattr(inp.dtype, 'metadata'): - return reduce_(inp, 'MEAN', axis, keepdims) - else: - s = reduce_(inp, 'SUM', axis, keepdims) - if axis is None: - cnt = inp.shape.prod() - else: - cnt = inp.axis_shape(axis) - return s / cnt - -def square(inp): - """*inp* squared""" - return inp ** 2 - -def sqrt(inp): - """square root""" - return inp ** 0.5 - - -class _LoopDescMakerCallback(_mgb._LoopDescMakerCallback): - def __init__(self, func): - super().__init__() - assert isinstance(func, collections.Callable) - self._func = func - 
self.__disown__() - - def call(self, desc): - self._func(desc) - - -def make_loop(desc_maker, *, - swap_interval=-5, name=None, comp_node=None, config=None): - """Create a loop operator. The loop operator works in the following way: - - 1. Copy variables specified by :meth:`.LoopDesc.add_input` from the parent - graph into the sub graph. - 2. Evaluates the loop condition. - 3. If the absolute value of the loop condition is no more than 1e-6, go to - 5. - 4. Update variables in the sub graph using rules specified by - :meth:`.LoopDesc.assign` and then go to 2 again. - 5. Copy values of output variables given by :meth:`.LoopDesc.add_output` - into the parent graph and exit. - - The loop operator could be thought of as a digital circuit, where the sub - graph (which must be purely functional) is the combinational logic part and - the :meth:`.LoopDesc.assign` rules serve as the flip-flops. - - :type desc_maker: callable - :param desc_maker: a function to create the loop descriptor; it would - receive a :class:`.LoopDesc` object and should call methods on it to - describe the sub graph. This function may be called multiple times, and - it should behave exactly the same in every call. - - :type swap_interval: int - :param swap_interval: number of loop executions between swapping saved - mutable states to host; larger *swap_interval* requires more memory and - less copy stall. If *swap_interval* is negative, then statically - inferred loop time would be used if possible; otherwise its absolute - value would be used as swap interval. - - :rtype: list of :class:`.SymbolVar` - :return: the output vars, corresponding to each - :meth:`.LoopDesc.add_output` call. - """ - config = _helper.gen_config(name, comp_node, config) - return _mgb._make_loop(_LoopDescMakerCallback(desc_maker), swap_interval, - config) - -def symvar_from_shared_nd(sv, comp_graph, name=None): - """get a symbol var in a computing graph that represents a shared (i.e. - pre-allocated) value on device - - :param sv: the shared value - :type sv: :class:`.SharedND` - :param comp_graph: the computing graph to which this symvar should belong - :type graph: :class:`.CompGraph` - :param name: the name of resulting symvar - :type name: str or None - :rtype: :class:`.SymbolVar` - """ - assert isinstance(sv, _mgb.SharedND) - return sv.symvar(comp_graph, name) - -def zero_grad(sv, **kwargs): - return set_grad(sv, None, **kwargs) - -# for backward pickle compatiblility -def _make_enum_unpickle(new_enum): - """create a class that can be used for unpickling old enum values""" - class OldEnum: - def __new__(cls, value): - return new_enum[value] - return OldEnum - - - -ConvMode = _make_enum_unpickle(_opr_param_defs.Convolution.Mode) -PoolingMode = _make_enum_unpickle(_opr_param_defs.Pooling.Mode) -ROIPoolingMode = _make_enum_unpickle(_opr_param_defs.ROIPooling.Mode) -WarpPerspectiveBorderMode = _make_enum_unpickle( - _opr_param_defs.WarpPerspective.BorderMode) -WarpPerspectiveInterpMode = _make_enum_unpickle( - _opr_param_defs.WarpPerspective.InterpolationMode) diff --git a/python_module/src/swig/callback.i b/python_module/src/swig/callback.i deleted file mode 100644 index d5953a13..00000000 --- a/python_module/src/swig/callback.i +++ /dev/null @@ -1,215 +0,0 @@ -/* - * $File: callback.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. 
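To make the loop semantics described above concrete, a hedged sketch of a counting loop in terms of the documented :class:`.LoopDesc` methods (``x`` is an assumed scalar SymbolVar in the parent graph; this is illustrative, not code from this diff):

    def desc_maker(desc):
        i = desc.add_input(x, has_assign=True)  # step 1: copy x into the sub graph
        desc.assign(i, i + 1)                   # step 4: the flip-flop update rule
        desc.set_loop_condition(i < 10)         # steps 2-3: repeat while |cond| > 1e-6
        desc.add_output(i, 'last')              # step 5: copy the final value back

    out, = make_loop(desc_maker)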
- */ - -%feature("autodoc", -"""It is used to be passed as arguments to callbacks (used in -:meth:`.CompGraph.compile`, :func:`.callback_injector`, and -:meth:`.CraniotomeBase.execute`). Object of this type could also be directly -passed to :meth:`.SharedND.set_value`, to bypass some host and device -communication. Note that the underlying buffer may be reused after the callback -returns, so reference to this object should not be passed outside of the -callback, and :meth:`get_value` should be called immediately if the numerical -value is needed.""") -CompGraphCallbackValueProxy; - -class CompGraphCallbackValueProxy { - public: - - PyObject* _get_npyarr(); - PyObject* _get_dtype(); - std::vector _get_shape(); - - uintptr_t _pubapi_dev_tensor_ptr(int version); - CompNode _get_comp_node(); - - %pythoncode{ - - @property - def shape(self): - """get shape of the var - - :type: tuple of int - """ - return tuple(map(int, self._get_shape())) - - @property - def comp_node(self): - """get comp node of the var - - :type: :class:`.CompNode` - """ - return self._get_comp_node() - - @property - def dtype(self): - """get data type of the var - - :type: :class:`.numpy.dtype` - """ - return self._get_dtype() - - def get_value(self, *, borrow_mem=False): - """get value as numpy array - - :param borrow_mem: whether to forward internal buffer with - zero-copy; if True, the content in returned buffer would be - modified directly by asynchronous graph execution. - """ - ret = self._get_npyarr() - if not borrow_mem: - ret = ret.copy() - return ret - - @property - def dev_ptr(self): - """this method is DEPRECATED; use :meth:`pubapi_dev_tensor_ptr` - instead""" - return self._pubapi_dev_tensor_ptr(0) - - @property - def pubapi_dev_tensor_ptr(self): - """get a pointer to the corresponding mgb::pubapi::DeviceTensor object - - :rtype: int - :return: the address as an integer - """ - return self._pubapi_dev_tensor_ptr(1) - } -}; -%template(_VectorCompGraphCallbackValueProxy) - std::vector; - -%feature("director") _CompGraphCallback; -class _CompGraphCallback { - public: - _CompGraphCallback(); - - void set_eager_copy(bool flag); - - virtual ~_CompGraphCallback(); - virtual void call(std::vector &value) = 0; -}; - -%feature("director") _SplitPartCallback; -class _SplitPartCallback { - public: - _SplitPartCallback(); - virtual ~_SplitPartCallback(); - - virtual std::vector call(size_t tot_size) = 0; -}; - -%feature("director") _SetGradCallback; -class _SetGradCallback { - public: - _SetGradCallback(); - virtual ~_SetGradCallback(); - - virtual SymbolVar call(CompGraph &graph) = 0; - virtual bool empty() = 0; -}; - -%feature("director") _TimeoutCallback; -class _TimeoutCallback { - public: - _TimeoutCallback(); - virtual ~_TimeoutCallback(); - - virtual bool call() = 0; -}; - -%pythoncode{ -import collections -import inspect -from .mgb_helper import callback_lazycopy - -class _CompGraphCallbackPyWrapper(_CompGraphCallback): - """wraps around a callable to be used as comp graph callback""" - - def __init__(self, f): - super().__init__() - if isinstance(f, callback_lazycopy): - f = f.func - self.set_eager_copy(False) - else: - self.set_eager_copy(True) - assert isinstance(f, collections.Callable) - self._func = f - self.__disown__() - - def call(self, value): - if value.size() == 1: - self._func(value[0]) - else: - self._func(value) - - -_CompGraphCallbackPyWrapperNoEager = lambda f: ( - _CompGraphCallbackPyWrapper(callback_lazycopy(f))) - -class _SplitPartCallbackPyWrapper(_SplitPartCallback): - def __init__(self, f): - 
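A usage sketch for the value-proxy callbacks wrapped here (hedged; ``on_output`` would be passed in an outspec pair to :meth:`.CompGraph.compile`):

    def on_output(value):
        # value is a CompGraphCallbackValueProxy; copy before returning,
        # since the underlying buffer may be reused afterwards
        arr = value.get_value()   # borrow_mem defaults to False: a private copy
        print(value.shape, value.dtype, arr.mean())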
super().__init__() - assert isinstance(f, collections.Callable) - self._func = f - self.__disown__() - - def call(self, size): - return tuple(map(int, self._func(size))) - - -class _SetGradCallbackPyWrapper(_SetGradCallback): - def __init__(self, f): - super().__init__() - if f is None: - self._func = None - else: - assert isinstance(f, collections.Callable) - nr_arg = len(list(filter( - lambda x: ( - x.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD and - x.default == inspect.Parameter.empty), - inspect.signature(f).parameters.values()))) - if not nr_arg: - f = lambda graph, f0=f: f0() - else: - assert nr_arg == 1, 'bad callback for SetGrad: {}'.format(f) - self._func = f - - self.__disown__() - - def call(self, graph): - if self._func is None: - return SymbolVar() - - ret = self._func(graph) - if ret is None: - ret = SymbolVar() - else: - assert isinstance(ret, SymbolVar), ( - 'bad return value for var maker: {!r}'.format(ret)) - return ret - - def empty(self): - return self._func is None - - -class _TimeoutCallbackPyWrapper(_TimeoutCallback): - def __init__(self, f): - super().__init__() - assert isinstance(f, collections.Callable) - self._func = f - self.__disown__() - - def call(self): - return bool(self._func()) - - -} // %pythoncode - -// vim: ft=swig diff --git a/python_module/src/swig/comp_graph.i b/python_module/src/swig/comp_graph.i deleted file mode 100644 index 80c05586..00000000 --- a/python_module/src/swig/comp_graph.i +++ /dev/null @@ -1,87 +0,0 @@ -/* - * $File: comp_graph.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. - */ - - -%pythoncode{ -from .mgb_helper import copy_output, FuncOutputSaver -import json -} // pythoncode - -%feature("autodoc", """a callable object compiled from :class:`CompGraph`. - -.. note:: - - Only the most recently compiled AsyncExec object can be used. -""") AsyncExec; -%feature("autodoc", """explicitly release the underlying staticially allocated -device memory""") AsyncExec::clear_device_memory; -class AsyncExec { - public: - AsyncExec() = delete; - - void _execute(); - void _wait(); - double _get_prev_exec_time(); - std::string _to_json_str(); - SymbolVarArray _find_mutable_input(); - std::vector> - _update_static_alloc_plan_and_get_size(); - - void clear_device_memory(); - - %include "comp_graph_impl_AsyncExec.py" -}; - -%template(_VectorAsyncExec) std::vector; - -%feature("autodoc", """use device memory manager in another computing graph to -manage memory of this graph, so their memories can be shared. 
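For example (a sketch; ``g1`` and ``g2`` are assumed :class:`CompGraph` objects):

    g2.share_device_memory_with(g1)   # g2 reuses g1's static device storage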
This is safe only -when :class:`AsyncExec` compiled from these graphs do not run concurrently.""") -CompGraph::share_device_memory_with; -%feature("valuewrapper") CompGraph; -class CompGraph { - public: - CompGraph(); - - AsyncExec _do_compile(bool copy, bool optimize_for_inference); - std::vector _do_compile_multi_part(); - void _add_output_spec(SymbolVar &var, _CompGraphCallback *callback); - void _add_multi_part_endpoint(); - void _clear_output_spec(); - size_t _release(); - - CompGraph& share_device_memory_with(CompGraph &other); - - PyObject* _user_data(); - void clear_device_memory(); - - %extend { - size_t _id() const { - return $self->get().id(); - } - - size_t _get_ptr_addr() const { - return reinterpret_cast(&$self->get()); - } - - std::string get_mem_allocation_info() const { - return self->get().get_mem_allocation_info(); - } - - std::string __repr__() const { - auto &&graph = $self->get(); - return mgb::ssprintf("", graph.id(), &graph); - } - } - - %include "comp_graph_impl_CompGraph.py" -}; - -%include "comp_graph_tools.i" - -// vim: ft=swig diff --git a/python_module/src/swig/comp_graph_impl_AsyncExec.py b/python_module/src/swig/comp_graph_impl_AsyncExec.py deleted file mode 100644 index 28836c13..00000000 --- a/python_module/src/swig/comp_graph_impl_AsyncExec.py +++ /dev/null @@ -1,229 +0,0 @@ -%pythoncode { - -_var2output_saver = None -"""map from var id to corresponding output saver; setup by -:meth:`.CompGraph.compile`""" - -_expand_single_output = False -"""whether output contains only a single element and it should not be wrapped -by a list; setup by :meth:`.CompGraph.compile`""" - -__output_savers = None -"""list of (symvar, output saver)""" - -__inputs = None -"""list of (symvar, _HostSharedND.make_proxy(symvar)) pairs""" - -__allow_args_input = None - -__warned_unused_keys = None - -__auto_wait_enabled = True - -callbefore_func = None - -callback_func = None - -def __normalize_var_list(self, vlist): - if len(vlist) == 1: - vlist, = vlist - if isinstance(vlist, SymbolVar): - return [vlist] - ret = [] - for i in vlist: - assert isinstance(i, SymbolVar) - ret.append(i) - return ret - -def _setup_args(self, args, kwargs): - if kwargs: - assert not args, 'should not provide both args and kwargs' - for symvar, hsv in self.__inputs: - val = kwargs.pop(symvar.name, None) - assert val is not None, ( - 'missing input at runtime: {}'.format(symvar)) - hsv.set_value(val, borrow=self.__auto_wait_enabled) - - if kwargs: - keys = set(kwargs.keys()) - if keys != self.__warned_unused_keys: - from .logconf import get_logger - logger = get_logger() - logger.warning( - 'extra kwargs provided for megbrain AsyncExec: {}'.format( - keys)) - self.__warned_unused_keys = keys - return - - assert not args or self.__allow_args_input, ( - 'pass non-keyword args to function compiled without' - ' inputs spec') - assert len(args) == len(self.__inputs), ( - 'inputs do not match: args={} needed={}'.format( - args, [i[0] for i in self.__inputs])) - for (symvar, hsv), val in zip(self.__inputs, args): - hsv.set_value(val, borrow=self.__auto_wait_enabled) - -def enable_borrow_on_cpu(self, flag=True): - """whether to allow borrow input tensor memory on CPU; if set to True, then - the user should ensure that memory buffers of input tensors are unchanged. - - This is set to False by default. - """ - for _, i in self.__inputs: - i.enable_borrow_on_cpu(flag) - -def __call__(self, *args, **kwargs): - """Execute the function; either one of positional arguments or keyword - arguments must be given. 
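For instance, assuming ``f`` was compiled with a single input var named ``'x'``:

    out = f(np.ones((2, 2), dtype='float32'))     # positional form
    out = f(x=np.ones((2, 2), dtype='float32'))   # keyword form, keyed by var name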
Set :attr:`inputs` to change the order of - positional arguments. The keys in keyword arguments are the names of input - symvars - - :return: if auto wait is disabled, the return value would be - :class:`FuncOutputSaver` objects corresponding to the vars marked by - :class:`copy_output`; if auto wait is enabled, the numerical values as - :class:`numpy.ndarray` would be returned. - """ - if self.callbefore_func: - if not callable(self.callbefore_func): - raise TypeError( - "callbefore func must be callable: {}".format(self.callbefore_func)) - self.callbefore_func() - self._setup_args(args, kwargs) - self._execute() - if self.callback_func: - if not callable(self.callback_func): - raise TypeError( - "callback func must be callable: {}".format(self.callback_func)) - self.callback_func() - if self.__auto_wait_enabled: - self.wait() - - if not self.__output_savers: - return - ret = [] - if self.__auto_wait_enabled: - for _, i in self.__output_savers: - ret.append(i.get()) - else: - for _, i in self.__output_savers: - ret.append(i) - if self._expand_single_output: - ret, = ret - return ret - -def wait(self): - """wait for previous async exec to finish; wait is needed (i.e. the - function runs in async mode) only when there is no output callback (i.e. - all outputs are given by dest symvar only), or :meth:`disable_auto_wait` is - called explicitly. - - :return: self""" - self._wait() - return self - -@property -def prev_exec_time(self): - """previous execution time in seconds""" - return self._get_prev_exec_time() - -@property -def inputs(self): - """get input vars needed at runtime, in the order as the values that should - be passed to :meth:`__call__` - - :setter: Set the order of input vars, which must be created by - :func:`.make_arg`. None could also given, and in such case only keyword - arguments would be allowed for :meth:`__call__` - - :type: tuple of :class:`.SymbolVar` - """ - return tuple(i[0] for i in self.__inputs) - -@inputs.setter -def inputs(self, *inputs): - if self.__inputs is None: - needed = tuple(self._find_mutable_input()) - used_names = set() - for i in needed: - assert i.name not in used_names, ( - 'duplicated input name: {}'.format(i.name)) - used_names.add(i.name) - self.__inputs = [(i, _HostSharedND.make_proxy(i)) for i in needed] - - if len(inputs) == 1 and inputs[0] is None: - self.__allow_args_input = False - return - - inputs = self.__normalize_var_list(inputs) - inpvar2proxy = dict(self.__inputs) - self.__allow_args_input = True - reordered = [] - for i in inputs: - proxy = inpvar2proxy.pop(i, None) - if proxy is None: - raise TypeError('extra input var provided: {}; needed: {}'.format( - i, self.inputs)) - reordered.append((i, proxy)) - - assert not inpvar2proxy, 'inputs not provided: {}'.format( - list(inpvar2proxy.keys())) - - self.__inputs = reordered - -@property -def available_outputs(self): - """get output vars that could be used to set :attr:`outputs`. The order may - be unstable - - :type: tuple of :class:`.SymbolVar`""" - return tuple(self._var2output_saver.keys()) - -@property -def outputs(self): - """get output vars whose corresponding values would be returned by - :meth:`__call__` - - :setter: set the order of output vars to be returned. Duplicated vars could - be included, but all the vars must have been provided to - :meth`.CompGraph.compile`. 
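A sketch of the reordering this setter allows (``f`` an :class:`AsyncExec`; ``x`` and ``y`` output vars that were passed to ``compile``):

    f.outputs = (y, x, y)   # duplicates allowed; __call__ returns values in this order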
- - :type: tuple of :class:`.SymbolVar`""" - if not self.__output_savers: - return - - if self._expand_single_output: - (var, saver), = self.__output_savers - return var - return tuple(var for var, saver in self.__output_savers) - -@outputs.setter -def outputs(self, *outputs): - olist = [] - for var in self.__normalize_var_list(outputs): - saver = self._var2output_saver.get(var) - assert saver is not None, 'var {} is not set to be output var'.format( - var) - olist.append((var, saver)) - self.__output_savers = olist - -def dump(self): - """dump internal graph and execution sequence as - json-serializable object""" - return json.loads(self._to_json_str()) - -def disable_auto_wait(self): - """if there is output callback function, then by default when - :meth:`__call__` is invoked, it would not return until all computation is - finished. This behavior can be changed by disabling auto wait, so the - function returns as early as possible.""" - self.__auto_wait_enabled = False - -def update_static_alloc_plan_and_get_size(self): - """update static memory allocation plan without actual allocation - - :return: a dict that maps from comp node to size of allocation in bytes - """ - return {k: v for k, v in self._update_static_alloc_plan_and_get_size()} - -} diff --git a/python_module/src/swig/comp_graph_impl_CompGraph.py b/python_module/src/swig/comp_graph_impl_CompGraph.py deleted file mode 100644 index 5f87a532..00000000 --- a/python_module/src/swig/comp_graph_impl_CompGraph.py +++ /dev/null @@ -1,191 +0,0 @@ -%pythoncode{ - -@property -def id(self): - """an integer increasing ID""" - return self._id() - -def __eq__(self, rhs): - return isinstance(rhs, CompGraph) and self.id == rhs.id - -def __hash__(self): - return self.id - -@property -def user_data(self): - """get a dict that is associated with this computing graph to store - arbitrary user data""" - return self._user_data() - -def _process_output_spec(self, inputs, outspec): - """process user-provided output spec and add to the output staging list of - this graph - - :return: a callable ``f(func)` to update compiled :class:`.AsyncExec` status - """ - assert outspec - - if isinstance(outspec, copy_output): - outspec = [outspec] - expand_single_output = True - else: - expand_single_output = False - - var2output_saver = {} - output_vars = [] - - for spec in outspec: - if isinstance(spec, copy_output): - var = spec.symvar - output_vars.append(var) - if var in var2output_saver: - continue - - callback = FuncOutputSaver(spec.borrow_mem) - var2output_saver[var] = callback - elif isinstance(spec, SymbolVar): - var = spec - callback = None - else: - var, callback = spec - assert isinstance(var, SymbolVar) - if callback is not None: - callback = _CompGraphCallbackPyWrapper(callback) - self._add_output_spec(var, callback) - - def update(func): - assert isinstance(func, AsyncExec) - func.inputs = inputs - if output_vars: - func._var2output_saver = var2output_saver - func._expand_single_output = expand_single_output - func.outputs = output_vars - - return update - -def compile(self, inputs, outspec, *, copy=False, optimize_for_inference=False): - """Compile the graph to get a callable function for numerical evaluation - - .. warning:: - - If ``compile()`` is called multiple times, only the most recent result - function can be used. 
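A minimal compile-and-run sketch under that caveat (``g`` a :class:`CompGraph`, ``x`` an input var from :func:`.make_arg`, ``y`` computed from it):

    func = g.compile([x], copy_output(y))   # a single copy_output is returned unwrapped
    val = func(np.zeros((4,), dtype='float32'))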
- - :type inputs: iterable of :class:`.SymbolVar` or None - :param inputs: specifying the positional parameters to be passed to the - generated function, or use None for keyword params only - :type outspec: iterable of *single_outspec* - :param outspec: specifying how the compiled function should - return outputs. Each *single_outspec* may be one of the - following forms: - - * a pair of (var, callback), the callback would be called during - function execution with a :class:`.CompGraphCallbackValueProxy` - argument corresponding to the given symbolvar. Additionally, - *callback* may be wrapped by :class:`.callback_lazycopy`; see the - its document for details. - * a single :class:`.SymbolVar`, to ensure this var is computed (so - the non-pure operators on its dependency path could take effect) - * a :class:`.copy_output` object, so the var's value would be - copied to the return value of compiled function. If there is one - such spec, the function would be synchronous. - :param copy: whether to copy the graph - :param optimize_for_inference: whether to run - :func:`.optimize_for_inference` on the output vars before compiling - :rtype: :class:`.AsyncExec` - """ - - self._clear_output_spec() - ret_update = self._process_output_spec(inputs, outspec) - ret = self._do_compile(copy, optimize_for_inference) - ret_update(ret) - return ret - -def compile_outonly(self, outputs, *, inputs=None): - """compile for only output; (almost) equavalent to - ``self.compile(inputs, [copy_output(i) for i in outputs])`` - - :type outputs: :class:`.SymbolVar` or list of - :class:`.SymbolVar` - :param outputs: the output symbol vars - """ - if isinstance(outputs, SymbolVar): - outputs = copy_output(outputs) - else: - assert isinstance(outputs, collections.Iterable), ( - '{} not iterable'.format(outputs)) - outputs = [copy_output(i) for i in outputs] - - return self.compile(inputs, outputs) - -def compile_multi_part(self, io_specs): - """Compile multiple functions for partial execution. Each function would - only execute the oprs necessary to compute current outspec, and intermediate - results from previous functions are reused. The functions would share - underlying device storage with this graph. - - .. warning:: - - Each individual partial function would have a newly created computing - graph. Therefore plugins attached on this graph would not be effective - on the partial functions. - - :param io_specs: input/output specifications as a list of - ``(inputs, outspec)`` pairs. Each pair is defined as the params of - :meth:`compile`. - :return: a list of :class:`.AsyncExec` objects as the functions - corresponding to each part - """ - self._clear_output_spec() - updaters = [] - for inputs, outspec in io_specs: - updaters.append(self._process_output_spec(inputs, outspec)) - self._add_multi_part_endpoint() - funcs = self._do_compile_multi_part() - for i, j in zip(funcs, updaters): - j(i) - return funcs - -def make_shared(self, comp_node, *, dtype=None, - shape=None, value=None, name=None, volatile=False): - """make a shared value belonging to this comp graph; see - :func:`.make_shared`""" - from . import make_shared - return make_shared(comp_node, dtype=dtype, shape=shape, value=value, - comp_graph=self, name=name, volatile=volatile) - -def make_immutable(self, comp_node, value, *, dtype=None, name=None): - """make an immutable value belonging to this comp graph; see - :func:`.make_immutable`""" - from . 
import make_immutable - return make_immutable(comp_node, self, value, dtype=dtype, name=name) - -def make_arg(self, comp_node, *, dtype=np.float32, shape=None, name=None, - value=None): - """make a runtime argument belonging to this comp graph; see - :func:`.make_arg`""" - from . import make_arg - return make_arg(comp_node, self, dtype=dtype, shape=shape, name=name, - value=value) - -def set_option(self, name, val): - """set comp graph option; see :func:`.set_comp_graph_option`""" - from .config import set_comp_graph_option - return set_comp_graph_option(self, name, val) - -def is_eager(self): - """return True if comp_graph is in eager mode""" - from .config import comp_graph_is_eager - return comp_graph_is_eager(self) - -def release(self): - """explicitly release the underlying computing graph storage; this is - mostly useful in eager evaluation mode, since doing so would release the - underlying device storage - - :return: original reference count before release - :rtype: int - """ - return int(self._release()) - -} diff --git a/python_module/src/swig/comp_graph_tools.i b/python_module/src/swig/comp_graph_tools.i deleted file mode 100644 index 7f6262df..00000000 --- a/python_module/src/swig/comp_graph_tools.i +++ /dev/null @@ -1,55 +0,0 @@ -%{ -#include "megbrain/gopt/framework.h" -%} - -%inline { - - SymbolVarArray _get_owner_opr_inputs(SymbolVar var) { - mgb_assert(var.node()); - return mgb::cg::to_symbol_var_array(var.node()->owner_opr()->input()); - } - - std::string _get_owner_opr_type(SymbolVar var) { - mgb_assert(var.node()); - return var.node()->owner_opr()->dyn_typeinfo()->name; - } - - std::string _get_opr_type(Operator opr) { - return opr.node()->dyn_typeinfo()->name; - } - - SymbolVarArray _replace_vars(const SymbolVarArray& repl_src, - const SymbolVarArray& repl_dst, - const SymbolVarArray& vars) { - mgb::ThinHashMap varmap; - for (size_t i = 0; i < repl_src.size(); ++i) { - varmap[repl_src[i]] = repl_dst[i]; - } - return mgb::cg::replace_vars(vars, varmap); - } - - typedef std::vector OperatorArray; - SymbolVarArray _replace_oprs(const OperatorArray& repl_src, - const OperatorArray& repl_dst, - const SymbolVarArray& vars) { - mgb::ThinHashMap - oprmap; - for (size_t i = 0; i < repl_src.size(); ++i) { - oprmap[repl_src[i].node()] = repl_dst[i].node(); - } - return mgb::cg::replace_oprs(vars, oprmap); - } - - void _set_priority_to_id(const SymbolVarArray& dest_vars) { - auto on_opr = [](mgb::cg::OperatorNodeBase* opr) { - if (opr->node_prop().attribute().priority == 0) { - opr->node_prop().attribute().priority = opr->id(); - } - }; - mgb::cg::DepOprIter dep_iter{on_opr}; - for (const SymbolVar& var : dest_vars) { - dep_iter.add(var); - } - } -} -// vim: ft=swig foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/swig/comp_node.i b/python_module/src/swig/comp_node.i deleted file mode 100644 index 708cd34f..00000000 --- a/python_module/src/swig/comp_node.i +++ /dev/null @@ -1,165 +0,0 @@ -/* - * $File: comp_node.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. 
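A usage sketch for the option helpers defined earlier in this hunk (``g`` an assumed :class:`CompGraph`; the option name is illustrative only):

    g.set_option('graph_opt_level', 2)   # forwards to set_comp_graph_option
    if g.is_eager():
        g.release()                      # free device storage in eager mode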
- */ - -%{ -using mgb::CompNode; -static CompNode::DeviceType str2device_type( - const std::string &str, bool allow_unspec) { - using T = CompNode::DeviceType; - if (str == "CPU") { - return T::CPU; - } else if (str == "CUDA" || str == "GPU") { - return T::CUDA; - } else { - mgb_assert(allow_unspec && str == "XPU", "bad device type: %s; which " - "must be either CPU, GPU or XPU", str.c_str()); - return T::UNSPEC; - } -} -%} - -class CompNode { - public: - static CompNode load(const char* id); - - %extend { - static std::vector _parse_locator(const std::string &id) { - auto logi = CompNode::Locator::parse(id); - return { - static_cast(logi.type), logi.device, logi.stream, - }; - } - static void _set_device_map(const std::string &type, - int from, int to) { - CompNode::Locator::set_device_map( - str2device_type(type, false), from, to); - } - - static size_t _get_device_count(const std::string &type, bool warn) { - return CompNode::get_device_count(str2device_type(type, true), warn); - } - - static void _set_unspec_device_type(const std::string &type) { - CompNode::Locator::set_unspec_device_type( - str2device_type(type, false)); - } - - static void _try_coalesce_all_free_memory() { - CompNode::try_coalesce_all_free_memory(); - } - - bool _check_eq(const CompNode &rhs) const { - return (*$self) == rhs; - } - - std::vector _get_locator() const { - auto logi = $self->locator_logical(), phys = $self->locator(); - return { - static_cast(logi.type), logi.device, logi.stream, - static_cast(phys.type), phys.device, - phys.stream, - }; - } - - std::string __getstate__() { - return $self->to_string_logical(); - } - - std::string __str__() { - return $self->to_string(); - } - - std::string __repr__() { - return mgb::ssprintf("CompNode(\"%s\" from \"%s\")", - $self->to_string().c_str(), - $self->to_string_logical().c_str()); - } - - size_t _get_mem_align_() const { - return $self->get_mem_addr_alignment(); - } - - size_t __hash__() { - return mgb::hash(*$self); - } - - std::pair _get_mem_status_bytes() { - return $self->get_mem_status_bytes(); - } - } - - %pythoncode { - DEVICE_TYPE_MAP = { - 0: 'XPU', - 1: 'CUDA', - 2: 'CPU' - } - - cn_thread_local = threading.local() - """used to save map location when calling :func:`mge.load()`""" - - def __setstate__(self, state): - """:func:`mge.load()` and :func:`deepcopy()` call this function, - The latter will not produce the map_location attribute""" - if "map_location" in CompNode.cn_thread_local.__dict__.keys(): - state = CompNode.cn_thread_local.map_location(state) - self.this = CompNode_load(state).this - - def __eq__(self, rhs): - return isinstance(rhs, CompNode) and self._check_eq(rhs) - - @property - def mem_align(self): - """memory alignment in bytes""" - return self._get_mem_align_() - - @property - def locator_logical(self) -> [str, int, int]: - """logical locator: a tuple containing (type, device, stream)""" - t, d, s = self._get_locator()[:3] - return self.DEVICE_TYPE_MAP[t], d, s - - @property - def locator_physical(self) -> [str, int, int]: - """physical locator: a tuple containing (type, device, stream)""" - t, d, s = self._get_locator()[3:] - return self.DEVICE_TYPE_MAP[t], d, s - - @property - def mem_status_bytes(self) -> [int, int]: - """get (total, free) memory on the computing device in bytes. - - Free memory includes memory chunks that buffered by the memory manager. - - Please note that the results are the same for different CompNode within same device. 
- """ - return self._get_mem_status_bytes() - } -}; -%template(_VectorCompNode) std::vector; -%template(_VectorCompNodeAndSize) std::vector>; - -%pythoncode { - -def as_comp_node(desc): - """create a :class:`.CompNode` by desc - - :type desc: str or :class:`.CompNode` - :param desc: if str, an id describing the comp node, like 'gpu0', 'gpu1'. A - special id 'gpux' represents the logical default comp node. Otherwise - it should already be a :class:`.CompNode`. - """ - if isinstance(desc, str): - return CompNode_load(desc) - assert isinstance(desc, CompNode), ( - 'could not convert {} to CompNode'.format(desc)) - return desc - -} - -// vim: ft=swig diff --git a/python_module/src/swig/craniotome.i b/python_module/src/swig/craniotome.i deleted file mode 100644 index adff8c4d..00000000 --- a/python_module/src/swig/craniotome.i +++ /dev/null @@ -1,88 +0,0 @@ -/* - * $File: craniotome.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. - */ - -%{ -#include "craniotome.h" -%} - -typedef std::vector> TensorShapeVec; -%template(_VectorTensorShape) std::vector>; - -%feature("director") CraniotomeDesc; -class CraniotomeDesc { - - public: - virtual ~CraniotomeDesc() = default; - - virtual void _setup_self(PyObject *result) const = 0; - - virtual bool _is_same(PyObject *rhs) const = 0; - - virtual uint32_t _node_flag() const = 0; - - virtual size_t _hash() const = 0; - - virtual std::string _get_opr_type_name() = 0; - - virtual size_t _get_nr_outputs() = 0; - - virtual void _execute( - const std::vector &inputs, - std::vector &outputs) = 0; - - virtual TensorShapeVec _infer_shape( - const TensorShapeVec &inp_shape) = 0; - - virtual SymbolVarArray _grad( - size_t wrt_idx, - const SymbolVarArray &inputs, - const SymbolVarArray &outputs, - const SymbolVarArray &out_grad) = 0; - - virtual size_t _get_nr_dev_comp_order_deps() = 0; - - SymbolVarArray _get_all_io_vars(); - - virtual bool _init_output_dtype( - PyObject *input_dtypes, PyObject *result) = 0; - - virtual CompGraph _get_comp_graph() = 0; - - virtual void _copy() const = 0; - void _set_copy_result(CraniotomeDesc *result); - - virtual void _setup_serialize_params(PyObject *output) const = 0; - - virtual void _on_graph_compile_or_func_del( - const std::vector& used_outputs) = 0; - - %extend { - CompNode _get_comp_node() { - mgb_assert($self->owner_opr); - return $self->owner_opr->comp_node(); - } - - size_t _get_opr_id() { - mgb_assert($self->owner_opr); - return $self->owner_opr->id(); - } - } -}; - -%inline { - static SymbolVarArray make_opr_from_craniotome_desc( - CraniotomeDesc *desc, - const SymbolVarArray inputs, - const OperatorNodeConfig &config) { - - return mgb::opr::Craniotome::make( - std::unique_ptr(desc), inputs, config); - } -} - -// vim: ft=swig diff --git a/python_module/src/swig/loop.i b/python_module/src/swig/loop.i deleted file mode 100644 index 0a094d2d..00000000 --- a/python_module/src/swig/loop.i +++ /dev/null @@ -1,134 +0,0 @@ -/* - * $File: loop.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. - */ - -%{ -#include "megbrain/opr/loop.h" -#include -using LoopDesc = mgb::opr::Loop::Desc; -%} - -%feature("autodoc", -"""An object used by callbacks for :func:`.make_loop` to describe the sub graph in -loop operator. See docs of :func:`.make_loop` for more explanation. 
-""") LoopDesc;
-
-%feature("autodoc",
-"""forward a variable belonging to the parent graph into the sub graph
-
-:type input: :class:`.SymbolVar`
-:param input: a variable in the parent graph
-:type has_assign: bool
-:param has_assign: whether this input var would be assigned later
-:rtype: :class:`.SymbolVar`
-:return: the corresponding variable in the sub graph
-""") LoopDesc::add_input;
-
-%feature("autodoc",
-"""instructs that the value of a variable in the sub graph should be replaced
-by the new value at the end of each loop iteration.
-
-:type dest: :class:`.SymbolVar`
-:param dest: the variable to be updated. It must be a return value of
-    :meth:`add_input`.
-:type val: :class:`.SymbolVar`
-:param val: the new value
-:return: self, to allow chaining
-""") LoopDesc::assign;
-
-%feature("autodoc",
-"""set a variable to indicate whether the loop should be repeated.
-
-:type cond: :class:`.SymbolVar`
-:param cond: the loop would be repeated if the absolute value of *cond* is
-    more than 1e-6; it must evaluate to a scalar
-:return: self, to allow chaining
-""") LoopDesc::set_loop_condition;
-
-%feature("autodoc",
-"""get the loop counter, which indicates the current loop count, starting
-from zero.
-
-:rtype: :class:`.SymbolVar`
-:return: the loop counter
-""") LoopDesc::get_counter_var;
-
-%feature("autodoc",
-"""mark a variable to be copied as an output value of the loop operator.
-
-:type var: :class:`.SymbolVar`
-:param var: a variable in the sub graph whose value should be copied into
-    the parent graph
-:type mode: str
-:param mode: output mode; possible values are:
-
-    * ``'last'``: only the last value would be recorded
-    * ``'all'``: all values would be recorded; the shape of the variable
-      should not change during looping, and the output var would be prepended
-      with an extra leading dimension to index the loop count.
- * ``'sum'``: sum of all values of this variable during looping would be - copied to output - * ``'product'``: product of all values of this variable during looping - would be copied to output -:rtype: int -:return: call id, starting at 0 and increasing continuously -""") LoopDesc::add_output; - -class LoopDesc { - public: - LoopDesc() = delete; - ~LoopDesc() = delete; - - SymbolVar add_input(SymbolVar input, bool has_assign = false); - LoopDesc& assign(SymbolVar dest, SymbolVar val); - LoopDesc& set_loop_condition(SymbolVar cond); - SymbolVar get_counter_var(); - - %extend { - size_t add_output(SymbolVar& var, std::string mode) { - using Desc = mgb::opr::Loop::Desc; - auto get_mode = [&]() { - using OM = Desc::OutputMode; - for (char &i: mode) - i = std::tolower(i); - if (mode == "last") - return OM::LAST; - if (mode == "all") - return OM::ALL; - if (mode == "sum") - return OM::SUM; - throw mgb::MegBrainError( - mgb::ssprintf("unrecognized loop mode: %s", - mode.c_str())); - }; - return $self->add_output(var, get_mode()); - } - } - -}; - -%feature("director") _LoopDescMakerCallback; -%inline { - class _LoopDescMakerCallback { - public: - virtual ~_LoopDescMakerCallback() = default; - virtual void call(LoopDesc &desc) = 0; - }; - - static SymbolVarArray _make_loop( - _LoopDescMakerCallback* callback, int swap_interval, - const OperatorNodeConfig &config) { - - std::shared_ptr<_LoopDescMakerCallback> callbackptr{callback}; - - auto desc_maker = [callbackptr](mgb::opr::Loop::Desc &loop_desc) { - callbackptr->call(loop_desc); - }; - return mgb::opr::Loop::make(desc_maker, swap_interval, config); - } -} // %inline - -// vim: ft=swig diff --git a/python_module/src/swig/mgb.i b/python_module/src/swig/mgb.i deleted file mode 100644 index 3a2c871d..00000000 --- a/python_module/src/swig/mgb.i +++ /dev/null @@ -1,71 +0,0 @@ -/* - * $File: mgb.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. 
- */ - -%include "symbol_var_array.i" - -%include "mgb_exception.i" -%module(directors="1") mgb -%{ -#define SWIG_FILE_WITH_INIT 1 -void mgb_init_numpy(); // implemented in python_helper.cpp -void _init_intbx_types(PyObject *m); // implemented in intbx.cpp -void _init_bfloat16_types(PyObject *m); // implemented in bfloat16.cpp -%} - -%init %{ - mgb_init_numpy(); - _init_intbx_types(m); - _init_bfloat16_types(m); -%} - -%include "std_vector.i" -%include "std_pair.i" -%include "stdint.i" -%template(_VectorSizeT) std::vector; -%template(_VectorInt) std::vector; -%template(_VectorString) std::vector; -%template(_PairStringSizeT) std::pair; -%template(_PairSizeTSizeT) std::pair; -%template(_VectorPairSizeTString) std::vector>; - -%pythoncode %{ -import numpy as np -import os -import threading -intb1 = _mgb.intb1 -intb2 = _mgb.intb2 -intb4 = _mgb.intb4 -bfloat16 = _mgb.bfloat16 -%} - -%{ -#include "megbrain/comp_node.h" -#include "megbrain/tensor.h" -#include "megbrain/graph.h" - -#include "megbrain_wrap.h" -#include "megbrain_config.h" -#include "megbrain_serialize.h" -#include "plugin.h" -%} - -%include "megbrain_build_config.h" -%include "comp_node.i" -%include "comp_graph.i" -%include "symbol_var.i" -%include "shared_nd.i" -%include "../cpp/megbrain_config.h" -%include "callback.i" -%include "operator.i" -%include "craniotome.i" -%include "misc.i" -%include "loop.i" -%include "../cpp/megbrain_serialize.h" -%include "../cpp/plugin.h" - -// vim: ft=swig diff --git a/python_module/src/swig/mgb_exception.i b/python_module/src/swig/mgb_exception.i deleted file mode 100644 index 8a6ee7c5..00000000 --- a/python_module/src/swig/mgb_exception.i +++ /dev/null @@ -1,40 +0,0 @@ -/* - * $File: mgb_exception.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. - */ - - -%include "std_string.i" -%include "std_except.i" -%include "pyabc.i" - -%{ -#include "python_helper.h" -%} - -namespace PyMGBExceptionMaker { - void _reg_exception_class(PyObject *cls); -} - -%feature("director:except") { - if ($error) - PyExceptionForward::throw_(); -} - -%include "exception.i" -%allowexception; -%exception { - try { - $action - } catch (std::exception &e) { - PyMGBExceptionMaker::setup_py_exception(e); - SWIG_fail; - } catch(...) { - SWIG_exception(SWIG_UnknownError, "Unknown exception"); - } -} - -// vim: ft=swig diff --git a/python_module/src/swig/misc.i b/python_module/src/swig/misc.i deleted file mode 100644 index d3f18946..00000000 --- a/python_module/src/swig/misc.i +++ /dev/null @@ -1,151 +0,0 @@ -/* - * $File: misc.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. 
- */ - - -%{ -#include "megbrain/utils/persistent_cache.h" -#include "megbrain/serialization/helper.h" -#include "megbrain/gopt/inference.h" -#include "megbrain/plugin/opr_footprint.h" -using _PyStackExtracter = PyStackExtracter; -using _PersistentCache = mgb::PersistentCache; -using _PersistentCacheBlob = _PersistentCache::Blob; -using _MaybePersistentCacheBlob = mgb::Maybe<_PersistentCacheBlob>; -using _OptimizeForInferenceOptions = mgb::gopt::OptimizeForInferenceOptions; -%} - -%feature("director") _PyStackExtracter; -class _PyStackExtracter { - public: - virtual ~_PyStackExtracter() = default; - virtual std::string extract() = 0; - static void reg(_PyStackExtracter *p); -}; - -// from Blob to python bytes -%typemap(in) const _PersistentCacheBlob& { - mgb_assert(PyBytes_Check($input)); - $1->ptr = PyBytes_AsString($input); - $1->size = PyBytes_Size($input); -} -%typemap(directorin) const _PersistentCacheBlob& { - $input = PyBytes_FromStringAndSize( - static_cast($1.ptr), $1.size); -} -%typemap(directorout) _MaybePersistentCacheBlob { - mgb_assert($1->ob_refcnt >= 2, "persistent cache result refcnt too small"); - if ($1 == Py_None) { - $result = mgb::None; - } else { - mgb_assert(PyBytes_Check($input)); - _PersistentCacheBlob blob; - blob.ptr = PyBytes_AsString($1); - blob.size = PyBytes_Size($1); - $result = blob; - } -} - -%feature("director") _PersistentCache; -class _PersistentCache { - public: - virtual ~_PersistentCache() = default; - - virtual void put(const std::string &category, - const _PersistentCacheBlob &key, - const _PersistentCacheBlob &value) = 0; - - virtual _MaybePersistentCacheBlob get( - const std::string &category, - const _PersistentCacheBlob &key) = 0; - - %extend { - static void reg(_PersistentCache *p) { - _PersistentCache::set_impl({p, [](_PersistentCache*){}}); - } - } -}; - -struct _OptimizeForInferenceOptions { -#define SET(n) void enable_##n(); - SET(f16_io_f32_comp); - SET(f16_io_comp); - SET(fuse_conv_bias_nonlinearity); - SET(fuse_conv_bias_with_z); -#undef SET -#define SET(_trans, _trans_capital) \ - void enable_##_trans(); \ - - SET(nchw4, NCHW4); - SET(nhwcd4, NHWCD4); - SET(nchw88, NCHW88); - SET(nchw44, NCHW44); - SET(nchw44_dot, NCHW44_DOT); - SET(nchw32, NCHW32); - SET(chwn4, CHWN4); -#undef SET -}; - -%inline { - static SymbolVarArray _optimize_for_inference( - const SymbolVarArray& dest_vars, - const _OptimizeForInferenceOptions& opt) { - return mgb::gopt::optimize_for_inference(dest_vars, opt); - } - - // defined in function_replace.cpp - void _register_logger(PyObject *logger); - void _timed_func_set_fork_exec_path(const char *arg0, const char *arg1); - void _timed_func_exec_cb(const char *user_data); - - // defined in megbrain_wrap.cpp - void _mgb_global_finalize(); - std::vector _get_mgb_version(); - SymbolVarArray _grad(SymbolVar target, SymbolVarArray wrts, - bool warn_mid_wrt, int use_virtual_grad, - bool return_zero_for_nodep); - SymbolVar _inter_graph_trans_var( - CompGraph &dest_graph, SymbolVar src); - SymbolVar _get_graph_optimizer_replaced_var(SymbolVar src); - void _add_update_fastpath(SharedND& dest, SharedND& delta, - float alpha, float beta, float bias); - void _add_update_fastpath(SharedND& dest, - CompGraphCallbackValueProxy& delta, - float alpha, float beta, float bias); - - static SymbolVar _current_grad_target(CompGraph &graph) { - return mgb::cg::current_grad_target(graph.get()); - } - - uint32_t _get_dtype_num(PyObject *dtype) { - return static_cast(npy::dtype_np2mgb(dtype).enumv()); - } - - PyObject* 
_get_serialized_dtype(PyObject *dtype) { - PYTHON_GIL; - std::string sdtype; - auto write = [&sdtype](const void* data, size_t size) { - auto pos = sdtype.size(); - sdtype.resize(pos + size); - memcpy(&sdtype[pos], data, size); - }; - mgb::serialization::serialize_dtype(npy::dtype_np2mgb(dtype), write); - return PyBytes_FromStringAndSize(sdtype.data(), sdtype.size()); - } - - size_t max_size_t() { - return std::numeric_limits::max(); - } - - std::string _get_opr_fp_graph_exec( - CompGraph& cg, const SymbolVarArray& outputs) { - auto json = mgb::OprFootprint::get_opr_fp_graph_exec(cg.get(), outputs); - return json->to_string(); - } -} - -// vim: ft=swig diff --git a/python_module/src/swig/operator.i b/python_module/src/swig/operator.i deleted file mode 100644 index e7bd0911..00000000 --- a/python_module/src/swig/operator.i +++ /dev/null @@ -1,148 +0,0 @@ -/* - * $File: operator.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. - */ - -%{ -#include "opr_helper.h" -#include "opr_defs.h" - -using ::mgb::cg::OperatorNodeConfig; - -using _AxisIndexer = AxisIndexer; - -static inline PyObject* _to_mgb_supported_dtype(PyObject* dtype) { - return ::npy::to_mgb_supported_dtype(dtype); -} - -%} - -%feature("autodoc", "Extra configuration for an operator") OperatorNodeConfig; -class OperatorNodeConfig { - public: - OperatorNodeConfig(); - void name(const std::string &name); - void comp_node(const CompNode &node); - - %extend { - void comp_node_arr(const std::vector &arr) { - OperatorNodeConfig::CompNodeArray tarr(arr.begin(), arr.end()); - $self->comp_node_arr(tarr); - } - - CompNode require_comp_node() { - mgb_assert($self->comp_node().size() == 1, - "comp_node is required for the config"); - return $self->comp_node()[0]; - } - - void output_dtype(PyObject* dtype) { - $self->output_dtype(npy::dtype_np2mgb(dtype)); - } - } -}; - -%feature("autodoc", - "representing a operator node in a computing graph") Operator; -class Operator { -public: - %extend { - size_t _get_id() const { - return $self->id(); - } - - const std::string& _get_name() const { - return $self->name(); - } - - const std::string& _get_params() const { - return $self->params(); - } - - SymbolVarArray _get_inputs() { - return $self->inputs(); - } - - SymbolVarArray _get_outputs() { - return $self->outputs(); - } - - CompGraph _get_owner_graph() { - const auto& cg = $self->get_owner_graph(); - return CompGraph::make_from_shared_ptr(cg); - } - - %include "operator.py" - } -}; - -%template(_VectorOperator) std::vector; - -class _AxisIndexer { -public: - static _AxisIndexer make_interval(int axis, SymbolVar begin, SymbolVar end, - SymbolVar step); - - static _AxisIndexer make_index(int axis, SymbolVar idx); -}; -%template(_VectorAxisIndexer) std::vector<_AxisIndexer>; - -%inline { - // all defined in opr_helper.cpp - SymbolVarArray _create_opr( - const char *name, const SymbolVarArray &inputs, PyObject *params, - const OperatorNodeConfig &config); - - SymbolVar _create_subtensor_like_opr( - const std::string &name, - const SymbolVarArray& inputs, - const std::vector<_AxisIndexer> &idx, - const OperatorNodeConfig &config); - - SymbolVar _make_immutable( - CompGraph &comp_graph, PyObject *npyarr, PyObject *dtype, - const OperatorNodeConfig &config); -} - -PyObject* _to_mgb_supported_dtype(PyObject *dtype); - -%include "../cpp/opr_defs.h" - -%pythoncode { - -def make_opr_config(name=None, comp_node=None, output_dtype=None): - 
"""make :class:`.OperatorNodeConfig` from given name or comp_node - - :type name: None or str - :param name: name for the operator - :type comp_node: None or comp_node-compatible or iterable of - comp_node-compatible - :param comp_node: a single comp_node, or iterable of comp_nodes - :type dtype: None or numpy-dtype compatible - :param dtype: the specified dtype the operator. - """ - rst = OperatorNodeConfig() - if comp_node is not None: - if isinstance(comp_node, str): - rst.comp_node(as_comp_node(comp_node)) - elif isinstance(comp_node, collections.Iterable): - vec = _VectorCompNode() - for i in comp_node: - vec.push_back(as_comp_node(i)) - rst.comp_node_arr(vec) - else: - rst.comp_node(as_comp_node(comp_node)) - if name is not None: - assert isinstance(name, str) - rst.name(name) - if output_dtype is not None: - rst.output_dtype(output_dtype) - - return rst - -} // %pythoncode - -// vim: ft=swig foldmethod=marker foldmarker=f{{{,f}}} diff --git a/python_module/src/swig/operator.py b/python_module/src/swig/operator.py deleted file mode 100644 index a1eb095e..00000000 --- a/python_module/src/swig/operator.py +++ /dev/null @@ -1,37 +0,0 @@ -%pythoncode { - -__owner_graph = None - -@property -def owner_graph(self): - """get the owner graph; note that a reference would be kept in this var""" - if self.__owner_graph is None: - self.__owner_graph = self._get_owner_graph() - return self.__owner_graph - -@property -def id(self): - """an integer identifier for this opr that is unique in the computing - graph""" - return int(self._get_id()) - -@property -def name(self): - return self._get_name() - -@property -def params(self): - import json - return json.loads(self._get_params()) - -@property -def inputs(self): - return tuple(self._get_inputs()) - -@property -def outputs(self): - return tuple(self._get_outputs()) - -def __repr__(self): - return 'Operator(id={},name={})'.format(self.id, self.name) -} diff --git a/python_module/src/swig/shared_nd.i b/python_module/src/swig/shared_nd.i deleted file mode 100644 index 03e3dced..00000000 --- a/python_module/src/swig/shared_nd.i +++ /dev/null @@ -1,125 +0,0 @@ -/* - * $File: shared_nd.i - * - * This file is part of MegBrain, a deep learning framework developed by Megvii. - * - * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved. 
- */ - -%pythoncode { -from .mgb_helper import SharedNDLazyInitializer -} // pythoncode - -%feature("autodoc", """a value stored on computing device and can be modified -by special operators in the graph""") SharedND; -class SharedND { - public: - SharedND(CompNode comp_node, PyObject *dtype); - - void _set_init_shape(const std::vector &shape); - void _resize(const std::vector &shape); - void _reset_zero(); - - PyObject* _get_npyarr(); - PyObject* _get_dtype(); - std::vector _get_shape(); - - void _copy_from_npyarr(PyObject *npyarr); - void _copy_from_value_proxy(CompGraphCallbackValueProxy &value); - void _share_from_value_proxy(CompGraphCallbackValueProxy &value); - static SharedND _from_symvar(SymbolVar symvar); - - void _set_copy_sync(bool flag); - uintptr_t _pubapi_dev_tensor_ptr(int version); - - void copy_to_sub_from_shared( - int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step, - const SharedND &rhs); - - void copy_from_shared_sub(const SharedND &rhs, - int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step); - - CompNode _get_comp_node(); - - SymbolVar _as_sym_var(CompGraph &graph, const std::string &name, - bool volatile_); - - void _share_memory_from(const SharedND &rhs, size_t begin); - - void _reset_dev_tensor(const SharedND& rhs); - - %include "shared_nd_SharedND.py" -}; -%template(_VectorSharedND) std::vector; - -class _HostSharedND { - public: - _HostSharedND(CompNode node, PyObject *dtype); - static _HostSharedND make_proxy(SymbolVar var); - - SymbolVar _as_sym_var(CompGraph &cg, bool enable_static_infer, - const std::string &name); - PyObject* _get_dtype(); - void _resize(const std::vector &shape); - void _copy_from_npyarr(PyObject *npyarr, bool borrow); - void _enable_borrow_on_cpu(bool flag); - std::string __repr__() const; - - %include "shared_nd_HostSharedND.py" -}; - - -%feature("autodoc", -"""a scalar value that can be modified after it has been created; -compared to :class:`SharedND`, it has the advantage that no comp node needs to -be specified.""") SharedScalar; -class SharedScalar { - public: - SharedScalar(PyObject *val); - void _set(PyObject *val); - PyObject* _get(); - bool _dtype_locked(); - void _lock_dtype(); - SymbolVar _as_sym_var(CompGraph &cg, CompNode &cn); - - %pythoncode { - - def lock_dtype(self): - """lock dtype so further set() calls must pass the same dtyped - value""" - self._lock_dtype() - - @property - def dtype_locked(self): - """whether dtype is locked""" - return self._dtype_locked() - - def set(self, val): - self._set(val) - - def get(self): - """get the value stored in this SharedScalar""" - return self._get()[0] - - def __getstate__(self): - state = self.__dict__.copy() - del state['this'] - state['__shared_scalar_value'] = self.get() - state['__shared_scalar_dtype_locked'] = self.dtype_locked - return state - - def __setstate__(self, state): - val = SharedScalar(state.pop('__shared_scalar_value')) - if state.pop('__shared_scalar_dtype_locked', True): - val._lock_dtype() - self.this = val.this - for k, v in state.items(): - self.__dict__[k] = v - - def __repr__(self): - return 'SharedScalar({})'.format(self.get()) - } -}; - - -// vim: ft=swig diff --git a/python_module/src/swig/shared_nd_HostSharedND.py b/python_module/src/swig/shared_nd_HostSharedND.py deleted file mode 100644 index f94ff798..00000000 --- a/python_module/src/swig/shared_nd_HostSharedND.py +++ /dev/null @@ -1,67 +0,0 @@ -%pythoncode{ - -__dtype = None - -def as_sym_var(self, cg, enable_static_infer, name=None): - """get symvar to represent value of this 
HostSharedND in a - computing graph - - :type cg: :class:`.CompGraph` - :param cg: computing graph - :type enable_static_infer: :class:`bool` - :param enable_static_infer: whether to enable static value - inference for this symvar; if set to True, the value must - be set up before calling :meth:`as_sym_var`. - """ - if name is None: - name = '' - return self._as_sym_var(cg, enable_static_infer, name) - -def symvar(self, comp_graph, name=None, *, enable_static_infer=None): - return self.as_sym_var(comp_graph, enable_static_infer, name) - -def enable_borrow_on_cpu(self, flag): - """whether to allow borrow memory in :meth:`set_value` if - the underlying comp ndoe is on CPU""" - self._enable_borrow_on_cpu(flag) - -def _set_value_print_warn( - self, reason, *, - disabled=os.getenv('MGB_DISABLE_SET_VALUE_WARN') is not None): - if disabled: - return - from .logconf import get_logger - logger = get_logger() - logger.warning('set {} from incompatible object is slow: {}'.format( - self, reason)) - -def set_value(self, w, *, borrow=False): - """set value to given numpy array - - :param borrow: if set to True, the memory of *w* may be - borrowed, and *w* must remain unmodified during usage of - this object - :type borrow: bool - :return: self - """ - if self.__dtype is None: - self.__dtype = self._get_dtype() - - if not isinstance(w, np.ndarray): - wtype = type(w) - w = np.ascontiguousarray(w, self.__dtype) - if w.size >= 1024: - self._set_value_print_warn( - 'not an ndarray object: {}'.format(wtype)) - elif w.size >= 1024: - if w.dtype != self.__dtype: - self._set_value_print_warn( - 'dtype mismatch: expect {}, get {}'.format( - self.__dtype, w.dtype)) - elif not w.flags['C_CONTIGUOUS']: - self._set_value_print_warn('non-contiguous ndarray') - - self._copy_from_npyarr(w, borrow) - return self - -} diff --git a/python_module/src/swig/shared_nd_SharedND.py b/python_module/src/swig/shared_nd_SharedND.py deleted file mode 100644 index da024c7a..00000000 --- a/python_module/src/swig/shared_nd_SharedND.py +++ /dev/null @@ -1,196 +0,0 @@ -%pythoncode{ - -__lazy_initializer = None - -def __apply_lazy_initializer(self): - """ __lazy_initializer released by self.set_value()""" - if self.__lazy_initializer is not None: - self.set_value(self.__lazy_initializer.get_value()) - -@property -def shape(self): - """get shape of unerlying data""" - if self.__lazy_initializer is not None: - val = self.__lazy_initializer.get_shape() - else: - val = self._get_shape() - return tuple(map(int, val)) - -@property -def comp_node(self): - return self._get_comp_node() - -@property -def dtype(self): - return self._get_dtype() - -@property -def lazy_initializer(self): - """object to specify how to initialize this SharedND, or None - if not set - - Please not that the initializer could be called at any time. - - :type: :class:`.SharedNDLazyInitializer` - """ - return self.__lazy_initializer - -@lazy_initializer.setter -def lazy_initializer(self, init): - assert not len(self._get_shape()), ( - 'can not set initializer for initialized SharedND') - assert isinstance(init, SharedNDLazyInitializer) - self.__lazy_initializer = init - -def set_value(self, w, *, sync=True, inplace=False, share=False): - """set value from a numpy array or from outputs in callback - - .. warning:: - - If sync is false, a reference to input is kept and the caller is - responsible to ensure that the input would not be modified after - this function returns. 
- - :param w: value to be set - :type w: :class:`numpy.ndarray`-compatible, :class:`SharedND` or - :class:`.CompGraphCallbackValueProxy` - :param sync: whether to sync device before returns - :type sync: bool - :param inplace: whether to copy in-place from another :class:`.SharedND`, - guaranteed no memory allocating; if True, this SharedND must have the - same shape as *w*, and buffer for this :class:`SharedND` would not be - re-allocated. - :param share: directly share the buffer in a - :class:`.CompGraphCallbackValueProxy` with zero copy - :return: self - """ - - if self is w: - return self - - if share: - assert isinstance(w, CompGraphCallbackValueProxy) - self._share_from_value_proxy(w) - return self - - self._set_copy_sync(sync) - if isinstance(w, CompGraphCallbackValueProxy): - self._copy_from_value_proxy(w) - return self - - if isinstance(w, SharedND): - w.__apply_lazy_initializer() - ax_type = -2 - if inplace: - ax_type = -3 - self.copy_from_shared_sub(w, ax_type, -1, -1, -1) - return self - assert not inplace, 'inplace only implemented for copying from SharedND' - - if self.__lazy_initializer is not None: - del self.__lazy_initializer - self._copy_from_npyarr(w) - return self - -def get_value(self): - """get value as numpy array - :return: numpy array, or None if value is empty""" - self.__apply_lazy_initializer() - return self._get_npyarr() - -def resize(self, *shape): - """resize the SharedND to given shape and allocate memory, without - initializing data; usually :meth:`pubapi_dev_tensor_ptr` is then called to - get the buffer address and pass it to some other library - - :return: self - """ - if len(shape) == 1 and isinstance(shape[0], collections.Iterable): - shape = shape[0] - self._resize(shape) - return self - -def reset_zero(self): - """reset dev_tensor to zeros""" - self._reset_zero() - -def copy_to(self, dest): - """copy value to numpy array - - :type dest: :class:`np.ndarray` - :param dest: destination array to write value of this var to, - which must match shape, have float32 dtype and be - contiguous - """ - check_cont_ndarray(dest) - wflat = dest.reshape(-1) - assert wflat.ctypes.data == dest.ctypes.data - self._copy_to_flatten(wflat) - return dest - -@property -def dev_ptr(self): - """this method is DEPRECATED; use :meth:`pubapi_dev_tensor_ptr` instead""" - return self._pubapi_dev_tensor_ptr(0) - -@property -def pubapi_dev_tensor_ptr(self): - """get a pointer to the corresponding mgb::pubapi::DeviceTensor object - - :rtype: int - :return: the address as an integer - """ - return self._pubapi_dev_tensor_ptr(1) - -def symvar(self, comp_graph, name=None, *, volatile=False): - """convert to SymbolVar to be put into a computing graph - - :param volatile: whether shape/ptr is allowed to change - """ - self.__apply_lazy_initializer() - assert self.shape, "initial shape must be available" - if name is None: - name = '' - return self._as_sym_var(comp_graph, name, volatile) - -def __getstate__(self): - state = self.__dict__.copy() - del state['this'] - state['value'] = self.get_value() - state['comp_node'] = self.comp_node - state['dtype'] = self.dtype - return state - -def __setstate__(self, state): - val = state.pop('value') - dtype = state.pop('dtype', 'float32') - snd = SharedND(state.pop('comp_node'), dtype) - if val is not None: - assert val.dtype == dtype - snd.set_value(val) - self.this = snd.this - for k, v in state.items(): - self.__dict__[k] = v - -def share_memory_from(self, rhs, offset): - """ - share memory from another SharedND, self and rhs must be 
-
-    :param rhs: another SharedND used to share memory
-    :type rhs: :class:`SharedND`
-
-    :param offset: offset in the rhs SharedND
-    :type offset: int
-    """
-    assert self != rhs
-    self._share_memory_from(rhs, offset)
-
-def reset_dev_tensor(self, rhs):
-    """
-    reset the device tensor to that of another SharedND; self and rhs must be
-    initialized.
-
-    :param rhs: another SharedND whose device tensor this one is reset to.
-    :type rhs: :class:`SharedND`
-    """
-    assert self != rhs
-    self._reset_dev_tensor(rhs)
-
-}
diff --git a/python_module/src/swig/symbol_var.i b/python_module/src/swig/symbol_var.i
deleted file mode 100644
index 2c86f206..00000000
--- a/python_module/src/swig/symbol_var.i
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * $File: symbol_var.i
- *
- * This file is part of MegBrain, a deep learning framework developed by Megvii.
- *
- * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved.
- */
-
-%{
-using mgb::cg::SymbolVar;
-%}
-
-%feature("autodoc",
-"representing a symbolic variable in a computing graph") SymbolVar;
-
-class SymbolVar {
-public:
-    SymbolVar flatten();
-    SymbolVar rename(const std::string &name);
-    bool allow_shape_change();
-
-    %extend {
-
-        SymbolVar fill_retain_dtype(PyObject *val) {
-            return fill_retain_dtype(*$self, val);
-        }
-
-        CompGraph _get_owner_graph() {
-            mgb_assert($self->node());
-            auto cg = $self->node()->owner_graph()->shared_from_this();
-            return CompGraph::make_from_shared_ptr(cg);
-        }
-
-        Operator _get_owner_opr() {
-            mgb_assert($self->node());
-            return Operator{$self->node()->owner_opr()};
-        }
-
-        CompNode _get_comp_node() {
-            mgb_assert($self->node());
-            return $self->node()->comp_node();
-        }
-
-        const std::string& _get_name() const {
-            mgb_assert($self->node());
-            return $self->node()->name();
-        }
-
-        size_t _get_id() const {
-            mgb_assert($self->node());
-            return $self->node()->id();
-        }
-
-        std::vector<size_t> _get_imm_shape() {
-            mgb_assert($self->node());
-            return npy::shape2vec($self->node()->shape());
-        }
-
-        PyObject* _get_inferred_value() {
-            return get_symvar_inferred_value(*$self);
-        }
-
-        bool _is_valid() const {
-            return $self->node();
-        }
-
-        PyObject* _get_dtype() const {
-            return npy::dtype_mgb2np($self->dtype());
-        }
-
-        CompGraphCallbackValueProxy _eager_eval_get_value() const {
-            CompGraphCallbackValueProxy ret;
-            ret.setup($self->eager_eval_get_value(), false);
-            return ret;
-        }
-
-        void _reeval_if_eager_eval() {
-            auto &&var = $self->node();
-            mgb_assert(var);
-            auto &&cg = var->owner_graph();
-            if (cg->options().eager_evaluation) {
-                mgb_assert(var->owner_opr()->inserted_in_graph());
-                cg->insert_opr(std::unique_ptr<mgb::cg::OperatorNodeBase>(
-                        var->owner_opr()));
-            }
-        }
-
-        bool _is_shared_device_tensor() {
-            if ($self->node()
-                        ->owner_opr()
-                        ->same_type<mgb::opr::SharedDeviceTensor>())
-                return true;
-            return false;
-        }
-
-        %include "symbol_var_SymbolVar.py"
-
-    }
-
-};
-
-typedef std::vector<SymbolVar> SymbolVarArray;
-%template(_VectorSymbolVar) std::vector<SymbolVar>;
-
-// SymbolVarArray compatibility; see symbol_var_array.i for more details
-%typemap(out) SymbolVarArray {
-    $result = swig::from(static_cast<std::vector<SymbolVar>&>($1));
-}
-%typemap(directorin) const SymbolVarArray& {
-    $input = swig::from(static_cast<std::vector<SymbolVar>&>($1));
-}
-
-// vim: ft=swig
diff --git a/python_module/src/swig/symbol_var_SymbolVar.py b/python_module/src/swig/symbol_var_SymbolVar.py
deleted file mode 100644
index 96fb3383..00000000
--- a/python_module/src/swig/symbol_var_SymbolVar.py
+++ /dev/null
@@ -1,216 +0,0 @@
-%pythoncode {
-
-__owner_graph = None
-__owner_opr = None
-
-@property
-def owner_graph(self):
-    """get the owner graph; note that a reference would be kept in this var"""
-    if self.__owner_graph is None:
-        self.__owner_graph = self._get_owner_graph()
-    return self.__owner_graph
-
-@property
-def owner_opr(self):
-    """get the owner opr; the owner graph is fetched explicitly so the opr
-    keeps a reference to its owner graph"""
-    if self.__owner_opr is None:
-        self.__owner_opr = self._get_owner_opr()
-
-    self.__owner_opr.owner_graph
-    return self.__owner_opr
-
-@property
-def comp_node(self):
-    return self._get_comp_node()
-
-@property
-def name(self):
-    return self._get_name()
-
-@property
-def id(self):
-    """an integer identifier for this var that is unique in the computing
-    graph"""
-    return int(self._get_id())
-
-@property
-def imm_shape(self):
-    """shape as immediate number
-
-    :type: tuple of int
-    """
-    return tuple(map(int, self._get_imm_shape()))
-
-@property
-def inferred_value(self):
-    """get statically inferred value of this var, or None if
-    inference failed
-
-    :type: :class:`numpy.ndarray` or None"""
-    return self._get_inferred_value()
-
-@property
-def valid(self):
-    """whether this symvar is valid (i.e. has a corresponding var node in the
-    graph)"""
-    return self._is_valid()
-
-@property
-def volatile(self):
-    """whether the shape is volatile"""
-    return not self._is_shared_device_tensor()
-
-@property
-def dtype(self):
-    """get the underlying data type
-
-    :rtype: :class:`numpy.dtype`"""
-    return self._get_dtype()
-
-def __hash__(self):
-    return hash((self.owner_graph, self.id))
-
-def __eq__(self, rhs):
-    return (isinstance(rhs, SymbolVar) and
-            self.owner_graph == rhs.owner_graph and
-            self.id == rhs.id)
-
-def _binary_opr(self, mode, rhs):
-    from .opr import elemwise
-    return elemwise([self, rhs], mode=mode)
-
-def _binary_opr_lhs(self, mode, lhs):
-    from .opr import elemwise
-    return elemwise([lhs, self], mode=mode)
-
-def __add__(self, rhs):
-    return self._binary_opr('ADD', rhs)
-def __radd__(self, lhs):
-    return self._binary_opr_lhs('ADD', lhs)
-
-def __sub__(self, rhs):
-    return self._binary_opr('SUB', rhs)
-def __rsub__(self, lhs):
-    return self._binary_opr_lhs('SUB', lhs)
-
-def __mul__(self, rhs):
-    return self._binary_opr('MUL', rhs)
-def __rmul__(self, lhs):
-    return self._binary_opr_lhs('MUL', lhs)
-
-def __matmul__(self, rhs):
-    from .opr import matrix_mul
-    return matrix_mul(self, rhs)
-def __rmatmul__(self, rhs):
-    from .opr import matrix_mul
-    return matrix_mul(rhs, self)
-
-def __lshift__(self, rhs):
-    return self._binary_opr('SHL', rhs)
-def __rshift__(self, rhs):
-    return self._binary_opr('SHR', rhs)
-
-def __truediv__(self, rhs):
-    return self._binary_opr('TRUE_DIV', rhs)
-def __rtruediv__(self, lhs):
-    return self._binary_opr_lhs('TRUE_DIV', lhs)
-
-def __floordiv__(self, rhs):
-    return self._binary_opr('FLOOR_DIV', rhs)
-def __rfloordiv__(self, rhs):
-    return self._binary_opr_lhs('FLOOR_DIV', rhs)
-
-def __mod__(self, rhs):
-    return self._binary_opr('MOD', rhs)
-def __rmod__(self, rhs):
-    return self._binary_opr_lhs('MOD', rhs)
-
-def __pow__(self, rhs):
-    return self._binary_opr('POW', rhs)
-def __rpow__(self, lhs):
-    return self._binary_opr_lhs('POW', lhs)
-
-def __lt__(self, rhs):
-    return self._binary_opr('LT', rhs)
-def __gt__(self, lhs):
-    return self._binary_opr_lhs('LT', lhs)
-
-def __le__(self, rhs):
-    return self._binary_opr('LEQ', rhs)
-def __ge__(self, lhs):
-    return self._binary_opr_lhs('LEQ', lhs)
-
-def __neg__(self):
-    from .opr import elemwise
-    return elemwise([self], mode='NEGATE')
-
-def __getitem__(self, idx):
-    from .helper import cvt_getitem_to_idx_desc
-    inpvar, desc = cvt_getitem_to_idx_desc(self, idx)
-    if desc is None:
-        return inpvar
-    return _create_subtensor_like_opr('subtensor', [inpvar], desc, make_opr_config())
-
-def reshape(self, *shp):
-    from .opr import reshape
-    return reshape(self, shp)
-
-def broadcast(self, *shp):
-    from .opr import broadcast
-    return broadcast(self, shp)
-
-def sum(self, axis=None, keepdims=False):
-    from .opr import reduce_
-    return reduce_(self, 'SUM', axis, keepdims)
-
-def max(self, axis=None, keepdims=False):
-    from .opr import reduce_
-    return reduce_(self, 'MAX', axis, keepdims)
-
-def min(self, axis=None, keepdims=False):
-    from .opr import reduce_
-    return reduce_(self, 'MIN', axis, keepdims)
-
-def prod(self, axis=None, keepdims=False):
-    from .opr import reduce_
-    return reduce_(self, 'PRODUCT', axis, keepdims)
-
-def mean(self, axis=None, keepdims=False):
-    from .opr import mean
-    return mean(self, axis, keepdims)
-
-def dimshuffle(self, *pattern, **kwargs):
-    from .opr import dimshuffle
-    ndim = kwargs.pop('ndim', 0)
-    assert not kwargs
-    return dimshuffle(self, pattern=pattern, ndim=ndim)
-
-def astype(self, target_dtype):
-    """see :func:`typecvt`"""
-    from .opr import typecvt
-    return typecvt(self, target_dtype)
-
-@property
-def shape(self):
-    from .opr import get_var_shape
-    return get_var_shape(self)
-
-def axis_shape(self, axis):
-    from .opr import get_var_shape
-    return get_var_shape(self, axis=axis)
-
-@property
-def eager_val(self):
-    """get value in eager evaluation mode"""
-    return self._eager_eval_get_value() if self.owner_graph.is_eager() else None
-
-
-def __iter__(self):
-    """add __iter__ to avoid implicit iteration by calling
-    __getitem__"""
-    raise NotImplementedError('SymbolVar object cannot be iterated')
-
-def __repr__(self):
-    return 'SymbolVar(id={},name={})'.format(self.id, self.name)
-
-}
diff --git a/python_module/src/swig/symbol_var_array.i b/python_module/src/swig/symbol_var_array.i
deleted file mode 100644
index 75b68dd0..00000000
--- a/python_module/src/swig/symbol_var_array.i
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * $File: symbol_var_array.i
- *
- * This file is part of MegBrain, a deep learning framework developed by Megvii.
- *
- * $Copyright: Copyright (c) 2014-2017 Megvii Inc. All rights reserved.
- */
-
-/*
- * In megbrain, SymbolVarArray is SmallVector<SymbolVar>.
- *
- * I do not want to convert between std::vector<> and mgb::SmallVector in the
- * C++ wrappers; neither do I want to write a SmallVector<> interface file as
- * good as swig's std::vector<> implementation.
- *
- * So the goal becomes making swig generate a python wrapper for std::vector<>,
- * but call SymbolVarArray in the generated C++ file.
- *
- * A logical solution is to derive SymbolVarArray from std::vector<> only in
- * the .i file so swig can use the correct name; however, the generated python
- * class then becomes non-iterable. So our hack here is to specialize std::vector
- * to use SymbolVarArray in the generated C++ file.
- *
- * This file must be included before the instantiation of std::vector<SymbolVar>.
- */
-%{
-#include <vector>
-#include "megbrain/graph/symbol_var.h"
-using SymbolVar = mgb::cg::SymbolVar;
-using SymbolVarArray = mgb::cg::SymbolVarArray;
-namespace std {
-template <typename alloc>
-class vector<SymbolVar, alloc> : public SymbolVarArray {
-public:
-    using SymbolVarArray::SymbolVarArray;
-    using allocator_type = alloc;
-
-    allocator_type get_allocator() const {
-        mgb_throw(mgb::MegBrainError, "get_allocator() should not be called");
-        return {};
-    }
-};
-}
-%}
-
-// vim: ft=swig
diff --git a/python_module/src/version.ld b/python_module/src/version.ld
deleted file mode 100644
index ac6e0802..00000000
--- a/python_module/src/version.ld
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-global:
-    MGB_VSYM_*;
-    MEGDNN_VSYM_*;
-    mgb_get_extern_c_opr_api_versioned;
-    PyInit__mgb;
-    extern "C++" {
-        *mgb::*;
-        *megdnn::*;
-        *megcore::*;
-        megcore*;
-    };
-    megcore*;
-
-local:
-    *;
-};
diff --git a/python_module/test/.gitignore b/python_module/test/.gitignore
deleted file mode 100644
index 8b29bf97..00000000
--- a/python_module/test/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-log.txt
-*json
-massif.out*
diff --git a/python_module/test/README.md b/python_module/test/README.md
deleted file mode 100644
index 797e8c5f..00000000
--- a/python_module/test/README.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# MegEngine Tests
-
-* unit: This directory has the same layout as the megengine directory.
-* regression: Small tests that check whether old issues stay fixed.
-* integration: Tests that involve multiple parts of MegEngine; tests that take longer than 1 min should be manual tests.
-* pytorch_comparison: Special directory for torch-related tests
-* helpers
-    - Test utilities should be placed in this directory
-    - `from helpers import ...` in your test code
-
-
-## Default running setup
-
-Execute `run.sh` to run the default set of tests:
-
-- No torch-related tests
-- No internet-related tests
-- No doc-related tests
diff --git a/python_module/test/__init__.py b/python_module/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/python_module/test/conftest.py b/python_module/test/conftest.py
deleted file mode 100644
index cf1a5359..00000000
--- a/python_module/test/conftest.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import os
-import sys
-
-sys.path.append(os.path.join(os.path.dirname(__file__)))
-
-
-def pytest_json_modifyreport(json_report):
-    events = []
-    timestamp = 0
-    for item in json_report["tests"]:
-        for stage in ["setup", "call", "teardown"]:
-            if stage in item:
-                events.append(
-                    {
-                        "name": item["nodeid"],
-                        "ph": "X",
-                        "ts": timestamp,
-                        "dur": item[stage]["duration"] * 1e6,
-                        "cat": stage,
-                        "pid": stage,
-                        "tid": item["nodeid"],
-                    }
-                )
-                timestamp += events[-1]["dur"]
-    json_report["traceEvents"] = events
-    del json_report["collectors"]
-    del json_report["tests"]
diff --git a/python_module/test/helpers/__init__.py b/python_module/test/helpers/__init__.py
deleted file mode 100644
index 63d89aed..00000000
--- a/python_module/test/helpers/__init__.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-from typing import List, Tuple
-
-import numpy as np
-
-import megengine._internal as mgb
-import megengine.functional as F
-from megengine import Graph, jit
-from megengine.module import Linear, Module
-from megengine.test import assertTensorClose
-
-from .env import modified_environ
-
-
-class MLP(Module):
-    def __init__(self):
-        super().__init__()
-        self.dense0 = Linear(28, 50)
-        self.dense1 = Linear(50, 20)
-
-    def forward(self, x):
-        x = self.dense0(x)
-        x = F.relu(x)
-        x = self.dense1(x)
-        return x
-
-
-def has_gpu(num=1):
-    try:
-        mgb.comp_node("gpu{}".format(num - 1))
-    except mgb.MegBrainError:
-        return False
-
-    return True
-
-
-def randomNp(*args):
-    for arg in args:
-        assert isinstance(arg, int)
-    return np.random.random(args)
-
-
-def randomTorch(*args):
-    import torch  # pylint: disable=import-outside-toplevel
-
-    for arg in args:
-        assert isinstance(arg, int)
-    return torch.tensor(randomNp(*args), dtype=torch.float32)
-
-
-def graph_mode(*modes):
-    if not set(modes).issubset({"eager", "static"}):
-        raise ValueError("graph mode must be in (eager, static)")
-
-    def decorator(func):
-        def wrapper(*args, **kwargs):
-            if "eager" in set(modes):
-                func(*args, **kwargs)
-            if "static" in set(modes):
-                with Graph() as cg:
-                    cg.set_option("eager_evaluation", False)
-                    func(*args, **kwargs)
-
-        return wrapper
-
-    return decorator
-
-
-def _default_compare_fn(x, y):
-    assertTensorClose(x.numpy(), y)
-
-
-def opr_test(
-    cases,
-    func,
-    mode=("eager", "static", "dynamic_shape"),
-    compare_fn=_default_compare_fn,
-    ref_fn=None,
-    **kwargs
-):
-    """
-    mode: the list of test modes; each must be one of "eager", "static" and
-        "dynamic_shape". All modes are tested if None.
-    func: the function that runs the opr.
-    compare_fn: the function used to compare the result with the expectation;
-        assertTensorClose is used if None.
-    ref_fn: the function used to generate the expected data; the cases should
-        carry an output if it is None.
-    cases: a list of dict elements; its length should be 2 for the dynamic
-        shape test. Each dict should have an "input" entry, plus an "output"
-        entry if ref_fn is None; use lists for multiple inputs or outputs in
-        each case.
-    kwargs: additional kwargs passed to the opr func.
- - simple examples: - - dtype = np.float32 - cases = [{"input": [10, 20]}, {"input": [20, 30]}] - opr_test(cases, - F.eye, - ref_fn=lambda n, m: np.eye(n, m).astype(dtype), - dtype=dtype) - - """ - - def check_results(results, expected): - if not isinstance(results, Tuple): - results = (results,) - for r, e in zip(results, expected): - compare_fn(r, e) - - def get_trace_fn(func, enabled, symbolic): - jit.trace.enabled = enabled - return jit.trace(func, symbolic=symbolic) - - def get_param(cases, idx): - case = cases[idx] - inp = case.get("input", None) - outp = case.get("output", None) - if inp is None: - raise ValueError("the test case should have input") - if not isinstance(inp, List): - inp = (inp,) - else: - inp = tuple(inp) - if ref_fn is not None and callable(ref_fn): - outp = ref_fn(*inp) - if outp is None: - raise ValueError("the test case should have output or reference function") - if not isinstance(outp, List): - outp = (outp,) - else: - outp = tuple(outp) - - return inp, outp - - if not set(mode).issubset({"eager", "static", "dynamic_shape"}): - raise ValueError("opr test mode must be in (eager, static, dynamic_shape)") - - if len(cases) == 0: - raise ValueError("should give one case at least") - - if "dynamic_shape" in set(mode): - if len(cases) != 2: - raise ValueError("should give 2 cases for dynamic shape test") - - if not callable(func): - raise ValueError("the input func should be callable") - - inp, outp = get_param(cases, 0) - - def run(*args, **kwargs): - return func(*args, **kwargs) - - if "eager" in set(mode): - f = get_trace_fn(run, False, False) - results = f(*inp, **kwargs) - check_results(results, outp) - - if "static" in set(mode) or "dynamic_shape" in set(mode): - f = get_trace_fn(run, True, True) - results = f(*inp, **kwargs) - check_results(results, outp) - if "dynamic_shape" in set(mode): - inp, outp = get_param(cases, 1) - results = f(*inp, **kwargs) - check_results(results, outp) diff --git a/python_module/test/helpers/env.py b/python_module/test/helpers/env.py deleted file mode 100644 index f5dae64d..00000000 --- a/python_module/test/helpers/env.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- -# The MIT License (MIT) -# -# Copyright (c) 2018 Laurent LAPORTE -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
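For concreteness, here is a hypothetical invocation of the `opr_test` helper above that exercises the two-case dynamic-shape contract (the shapes and the doubling opr are made up for illustration; only the case layout matters):

```python
import numpy as np

# Two cases with different input shapes drive the "dynamic_shape" mode;
# ref_fn computes the expected output from each case's input, so no
# explicit "output" entries are needed.
cases = [
    {"input": [np.random.randn(4, 6).astype("float32")]},
    {"input": [np.random.randn(8, 3).astype("float32")]},
]
opr_test(cases, lambda x: x * 2, ref_fn=lambda x: x * 2)
```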
-import contextlib -import os - - -# modified_environ codes come from https://github.com/laurent-laporte-pro/stackoverflow-q2059482/blob/master/demo/environ_ctx.py -@contextlib.contextmanager -def modified_environ(*remove, **update): - """ - Temporarily updates the ``os.environ`` dictionary in-place. - - The ``os.environ`` dictionary is updated in-place so that the modification - is sure to work in all situations. - - :param remove: Environment variables to remove. - :param update: Dictionary of environment variables and values to add/update. - """ - env = os.environ - update = update or {} - remove = remove or [] - - # List of environment variables being updated or removed. - stomped = (set(update.keys()) | set(remove)) & set(env.keys()) - # Environment variables and values to restore on exit. - update_after = {k: env[k] for k in stomped} - # Environment variables and values to remove on exit. - remove_after = frozenset(k for k in update if k not in env) - - try: - env.update(update) - [env.pop(k, None) for k in remove] - yield - finally: - env.update(update_after) - [env.pop(k) for k in remove_after] diff --git a/python_module/test/helpers/torch_util.py b/python_module/test/helpers/torch_util.py deleted file mode 100644 index 659033c5..00000000 --- a/python_module/test/helpers/torch_util.py +++ /dev/null @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np -import torch - -from megengine.core import tensor -from megengine.utils import prod - - -def _uniform(shape): - return np.random.random(shape).astype(np.float32) - - -def init_with_same_value(mge_param, torch_param, initializer=_uniform): - mge_shape = mge_param.shape - torch_shape = torch_param.shape - assert prod(mge_shape) == prod(torch_shape) - weight = initializer(mge_shape) - mge_param.set_value(weight) - torch_param.data = torch.Tensor(weight.reshape(torch_shape)) - - -def gen_same_input(shape, initializer=_uniform): - data = initializer(shape) - mge_input = tensor(data) - torch_input = torch.Tensor(data) - return mge_input, torch_input diff --git a/python_module/test/integration/manual/README.md b/python_module/test/integration/manual/README.md deleted file mode 100644 index aad1ed6f..00000000 --- a/python_module/test/integration/manual/README.md +++ /dev/null @@ -1,178 +0,0 @@ -# Regression test -* [How to run](#how-to-run) -* [Correctness](#correctness) -* [Performance](#performance) -* [Debug tools](#debug-tools) -* [To do list](#to-do-list) - -## How to run - -1. Run correctness regression test by - -``` -rlaunch --cpu=4 --memory=15000 --gpu=1 -- python3 verify_correctness.py -``` - -2. Run performance regression test by - -``` -rlaunch --cpu=4 --memory=15000 --gpu=1 -- python3 run_resnet50_perf.py -``` - -Compare with the [reference result](#performance) to verify the performance change. - -3. [Temporary]: Run dynamic graph test - -``` -cd python_module/megengine/examples/cifar10/resnet_example -rlaunch --cpu=4 --memory=15000 --gpu=1 -- MGE_DISABLE_TRACE=1 python3 main.py --mode train --backend megengine-dynamic -``` - -Be sure to run a few epochs to verify the CPU/GPU memory usage and the result tends to converge. The complete run takes around 2 hours. 
- -## Correctness - -Pre-trained Resnet18 model on cifar10 dataset is used. - -The test set contains -* forward run with static graph -* forward run with dynamic graph -* forward + backward + parameter update with static graph -* forward + backward + parameter update with dynamic graph - -Sample output: - -``` -Running fwd static ... -Success -Running fwd dynamic ... -Success -Running train static ... -Success -Running train dynamic ... -Failed!!! -import megengine operator -[INFO] load /home/zhangfan/.local/lib/python3.6/site-packages/megengine/examples/cifar10/resnet_example/checkpoint/pytorch_init.pth done -calculated loss: [2.3731833, 34.4626] -expect: [ 2.3731833 34.460594 ] -``` - -## Performance - -Test cases run Resnet 50 training with batch size = 64. - -Run `python3 resnet50_perf.py --help` for valid options. - -Example script: - -* Run `python3 run_resnet50_perf.py` -* You may want to submit the job to a remote server by `rlaunch --cpu=16 --memory=100384 --gpu=8 -- python3 run_resnet50_perf.py` -* Sample output -``` -************************************** -Run ResNet 50 performance test with batch size = 64 -************************************** -Run static graph with default opt level -Finish with GPU Usage 6710MiB -Wall time per iter 283 ms -Run status: finished -************************************** -Run static graph with conv fastrun -Finish with GPU Usage 6540MiB -Wall time per iter 265 ms -Run status: finished -************************************** -Run static graph with conv fastrun and JIT -Finish with GPU Usage 6540MiB -Wall time per iter 267 ms -Run status: finished -************************************** -Run static graph with JIT, conv fastrun and without running step -Finish with GPU Usage 6540MiB -Wall time per iter 223 ms -Run status: finished -************************************** -``` - -## Debug tools - -You can pass `--run-debug-tool` to script `run_resnet50_perf.py`. Opr-level profiling result and valgrind will be invoked. - -### How much overhead time will it take due to usage of the profiler - -Please compare the same job with/without profiler. The timing statistic reported by profiler does not include the overhead time from itself. - -### How can I get more information from profiler? - -Refer to the main function in `megengine.utils.profile_analyze`. - -### How can I profile main memory usage? - -Valgrind massif tool can be used. 
The script also prints a memory usage summary on screen as:

```
    GB
1.836^                                                                    #
     |                                                         @@#::::::@:::
     |                                                      @@@ #::::::@:::
     |                                ::::::::::::@:::::::::@:@@@ #::::::@:::
     |                              ::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |                            @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |                          ::@@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |                        @:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |                      @@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |                     :@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |                   @::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |                  @@::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |                @:@@::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |              :@ @@::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |           ::::@ @@::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |          :::: :@ @@::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |        :@: :: :@ @@::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |       :@@: :: :@ @@::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |    @@:@@: :: :@ @@::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
     |   @@ :@@: :: :@ @@::@@@:: @@::::: :::::: @ ::: ::: @:@@@ #::::::@:::
   0 +----------------------------------------------------------------------->Gi
     0                                                                   19.39
```

You can change the "--run-iter" value to adjust the number of iters to profile.
The detailed profiling is printed to `massif.out.ms_print`.

### How can I understand the profiler result?

The dumped profiling file `prof.json` can be interpreted by [megengine/utils/profile_analyze.py](../../utils/profile_analyze.py).
The following information is printed from the profiler:

```
-----------------  --------
total device time  0.318062
total host time    0.275643
-----------------  --------

╒════════════════════╤══════════════╤═══════════════════════════╤═══════════════╤═════════╤══════════╤═════════════╤═════════════╤══════════════╕
│ device self time   │ cumulative   │ operator info             │ computation   │ FLOPS   │ memory   │ bandwidth   │ in_shapes   │ out_shapes   │
╞════════════════════╪══════════════╪═══════════════════════════╪═══════════════╪═════════╪══════════╪═════════════╪═════════════╪══════════════╡
│ #0                 │ 0.114        │ Elemwise                  │ 6.53          │ 57.40   │ 51.63    │ 454.02      │ None        │ None         │
│ 0.114              │ 35.8%        │ 1481                      │ GFLO          │ GFLOPS  │ GiB      │ GiB/s       │             │              │
│ 35.8%              │              │ N/A                       │               │         │          │             │             │              │
├────────────────────┼──────────────┼───────────────────────────┼───────────────┼─────────┼──────────┼─────────────┼─────────────┼──────────────┤
│ #1                 │ 0.176        │ ConvolutionBackwardFilter │ 523.15        │ 8.35    │ 5.28     │ 84.24       │ None        │ None         │
│ 0.0627             │ 55.5%        │ 53                        │ GFLO          │ TFLOPS  │ GiB      │ GiB/s       │             │              │
│ 19.7%              │              │ N/A                       │               │         │          │             │             │              │
├────────────────────┼──────────────┼───────────────────────────┼───────────────┼─────────┼──────────┼─────────────┼─────────────┼──────────────┤
│ #2                 │ 0.221        │ ConvolutionBackwardData   │ 508.05        │ 11.31   │ 5.05     │ 112.42      │ None        │ None         │
│ 0.0449             │ 69.6%        │ 52                        │ GFLO          │ TFLOPS  │ GiB      │ GiB/s       │             │              │
│ 14.1%              │              │ N/A                       │               │         │          │             │             │              │
├────────────────────┼──────────────┼───────────────────────────┼───────────────┼─────────┼──────────┼─────────────┼─────────────┼──────────────┤
```
Please read [megengine/utils/profile_analyze.py](../../utils/profile_analyze.py) for more usage details.
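As a rough illustration of the kind of aggregation `profile_analyze.py` performs, the sketch below sums device time per operator type from a dumped profile. The record layout assumed here (a `"profile"` list whose entries carry `"type"` and `"device_time"` fields) is hypothetical, for illustration only; consult `profile_analyze.py` for the real `prof.json` schema:

```python
import json
from collections import defaultdict

with open("prof.json") as f:
    prof = json.load(f)

# Sum device time per operator type (field names are assumptions).
per_type = defaultdict(float)
for rec in prof["profile"]:
    per_type[rec["type"]] += rec["device_time"]

# Print the 10 most expensive operator types, mirroring the effect of
# `--aggregate-by=type --aggregate=sum -t 10`.
for opr_type, t in sorted(per_type.items(), key=lambda kv: -kv[1])[:10]:
    print("{:<30} {:.6f} s".format(opr_type, t))
```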
- -## To do list - -* Change numerical tolerance after XPU-280 is done -* Add scripts to facilitate log analysis -* Profile GPU memory -* Incorporate with QA system -* Add more regression tests diff --git a/python_module/test/integration/manual/resnet50_perf.py b/python_module/test/integration/manual/resnet50_perf.py deleted file mode 100644 index 9f36e8f0..00000000 --- a/python_module/test/integration/manual/resnet50_perf.py +++ /dev/null @@ -1,210 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import argparse -import json -import os -import subprocess -import sys -import time - -import numpy as np - -import megengine as mge -import megengine.distributed as dist -import megengine.functional as F -from megengine._internal.plugin import CompGraphProfiler -from megengine.core import Graph, tensor -from megengine.core.graph import get_default_graph -from megengine.functional.debug_param import ( - get_conv_execution_strategy, - set_conv_execution_strategy, -) -from megengine.jit import trace -from megengine.module import BatchNorm2d, Conv2d, Linear, MaxPool2d, Module -from megengine.optimizer import SGD - -sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples")) - - -def init_profiler(comp_graph=get_default_graph()): - profiler = CompGraphProfiler(comp_graph) - return profiler - - -def dump_profiler(profiler, filename): - with open(filename, "w") as fout: - json.dump(profiler.get(), fout, indent=2) - - -def print_gpu_usage(): - stdout = subprocess.getoutput("nvidia-smi") - for line in stdout.split("\n"): - for item in line.split(" "): - if "MiB" in item: - print("Finish with GPU Usage", item) - break - - -def run_perf( - batch_size=64, - warm_up=True, - dump_prof=None, - opt_level=2, - conv_fastrun=False, - run_step=True, - track_bn_stats=True, - warm_up_iter=20, - run_iter=100, - num_gpu=None, - device=0, - server=None, - port=None, - scale_batch_size=False, - eager=False, -): - - # pylint: disable = import-outside-toplevel - from resnet50 import Resnet50 - - if conv_fastrun: - set_conv_execution_strategy("PROFILE") - - if num_gpu: - dist.init_process_group(args.server, args.port, num_gpu, device, device) - if scale_batch_size: - batch_size = batch_size // num_gpu - print("Run with data parallel, batch size = {} per GPU".format(batch_size)) - - data = tensor(np.random.randn(batch_size, 3, 224, 224).astype("float32")) - label = tensor(np.random.randint(1000, size=[batch_size,], dtype=np.int32)) - - net = Resnet50(track_bn_stats=track_bn_stats) - opt = SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4) - - def train_func(data, label): - logits = net(data) - loss = F.cross_entropy_with_softmax(logits, label) - - if num_gpu: - loss = loss / num_gpu - - opt.zero_grad() - opt.backward(loss) - return loss - - train_func = trace( - train_func, - symbolic=(not eager), - opt_level=opt_level, - profiling=not (dump_prof is None), - ) - - if warm_up: - print("Warm up ...") - for _ in range(warm_up_iter): - opt.zero_grad() - train_func(data, label) - if run_step: - opt.step() - print_gpu_usage() - print("Running train ...") - start = time.time() - for _ in range(run_iter): - opt.zero_grad() - train_func(data, label) - if 
run_step: - opt.step() - - time_used = time.time() - start - - if dump_prof: - with open(dump_prof, "w") as fout: - json.dump(train_func.get_profile(), fout, indent=2) - - return time_used / run_iter - - -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise argparse.ArgumentTypeError("Boolean value expected.") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Running regression test on Resnet 50", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument("--batch-size", type=int, default=64, help="batch size ") - parser.add_argument( - "--warm-up", type=str2bool, default=True, help="whether to warm up" - ) - parser.add_argument( - "--dump-prof", - type=str, - default=None, - help="pass the json file path to dump the profiling result", - ) - parser.add_argument("--opt-level", type=int, default=2, help="graph opt level") - parser.add_argument( - "--conv-fastrun", - type=str2bool, - default=False, - help="whether to use conv fastrun mode", - ) - parser.add_argument( - "--run-step", - type=str2bool, - default=True, - help="whether to run optimizer.step()", - ) - parser.add_argument( - "--track-bn-stats", - type=str2bool, - default=True, - help="whether to track bn stats", - ) - parser.add_argument( - "--warm-up-iter", type=int, default=20, help="number of iters to warm up" - ) - parser.add_argument( - "--run-iter", type=int, default=100, help="number of iters to collect wall time" - ) - parser.add_argument("--server", default="0.0.0.0") - parser.add_argument("--port", type=int, default=2222) - parser.add_argument( - "--scale-batch-size", - type=str2bool, - default=False, - help="whether to divide batch size by number of GPUs", - ) - parser.add_argument( - "--eager", type=str2bool, default=False, help="whether to use eager mode" - ) - - # Data parallel related - parser.add_argument("--num-gpu", type=int, default=None) - parser.add_argument("--device", type=int, default=0) - args = parser.parse_args() - - print(vars(args)) - - os.environ["MGB_JIT_BACKEND"] = "NVRTC" - - t = run_perf(**vars(args)) - - print("**********************************") - print("Wall time per iter {:.0f} ms".format(t * 1000)) - print("**********************************") - get_default_graph().clear_device_memory() diff --git a/python_module/test/integration/manual/run_resnet50_perf.py b/python_module/test/integration/manual/run_resnet50_perf.py deleted file mode 100644 index a8a24d99..00000000 --- a/python_module/test/integration/manual/run_resnet50_perf.py +++ /dev/null @@ -1,162 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
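The wall-time-per-iter figures that these scripts report follow the usual warm-up-then-measure pattern implemented in `run_perf` above; a minimal sketch of that methodology, where `train_step` is a stand-in for the traced training function:

```python
import time

def time_per_iter(train_step, data, label, warm_up_iter=20, run_iter=100):
    # Warm-up lets kernel selection (e.g. conv fastrun profiling) and
    # graph compilation settle before any timing is recorded.
    for _ in range(warm_up_iter):
        train_step(data, label)
    start = time.time()
    for _ in range(run_iter):
        train_step(data, label)
    return (time.time() - start) / run_iter  # average seconds per iteration
```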
-import argparse
-import os
-import pathlib
-import subprocess
-
-from megengine.utils.profile_analyze import main as profiler
-
-home = pathlib.Path(__file__).parent.absolute()
-script_path = os.path.join(str(home), "resnet50_perf.py")
-script_path = "python3 " + script_path
-
-prof_path = "prof.json"
-
-log_path = "log.txt"
-
-
-def print_log(msg: str, log: str = log_path):
-    print(msg)
-    with open(log, "a") as f:
-        print(msg, file=f)
-
-
-def run_cmd(cmd: str, log: str = log_path) -> bool:
-    stdout = subprocess.getoutput(cmd)
-    token = "Wall time"
-    gpu_msg = "GPU Usage"
-    run_finished = False
-    for line in stdout.split("\n"):
-        if token in line:
-            print(line)
-            print_log("Run status: finished")
-            run_finished = True
-        if gpu_msg in line:
-            print(line)
-    if not run_finished:
-        print_log("Run status: failed")
-        with open(log, "a") as f:
-            print(stdout, file=f)
-
-    return run_finished
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(description="ResNet50 train performance")
-    parser.add_argument(
-        "--run-debug-tool", action="store_true", help="run profiler and valgrind"
-    )
-    parser.add_argument(
-        "--run-parallel", action="store_true", help="run data parallel performance"
-    )
-    parser.add_argument("--run-eager", action="store_false", help="run eager graph")
-    args = parser.parse_args()
-
-    f = open(log_path, "w")
-    f.close()
-
-    print_log("**************************************")
-    print_log("Run ResNet 50 performance test with batch size = 64")
-
-    print_log("**************************************")
-    print_log("Run static graph with default opt level")
-    cmd = script_path
-    run_cmd(cmd)
-
-    print_log("**************************************")
-    print_log("Run static graph with conv fastrun")
-    cmd = script_path + " --conv-fastrun=yes"
-    run_cmd(cmd)
-
-    print_log("**************************************")
-    print_log("Run static graph with conv fastrun and JIT")
-    cmd = script_path + " --conv-fastrun=yes --opt-level=3"
-    run_cmd(cmd)
-
-    print_log("**************************************")
-    print_log("Run static graph with JIT, conv fastrun and without running step")
-    cmd = script_path + " --conv-fastrun=yes --opt-level=3 --run-step=no"
-    run_cmd(cmd)
-
-    if args.run_eager:
-        print_log("**************************************")
-        print_log("Run static graph with default opt level and batch-size=8")
-        cmd = script_path + " --batch-size=8"
-        run_cmd(cmd)
-        print_log("**************************************")
-        print_log("Run eager graph with default opt level and batch-size=8")
-        cmd = script_path
-        run_cmd("MGE_DISABLE_TRACE=1 " + cmd + " --eager=yes")
-
-    if args.run_debug_tool:
-
-        print_log("**************************************")
-        print_log("Run with dump_prof")
-        cmd = script_path + " --dump-prof=" + prof_path
-        if run_cmd(cmd):
-            print("Printing profiling result")
-            profiler([prof_path, "--aggregate-by=type", "--aggregate=sum", "-t 10"])
-
-        print_log("**************************************")
-        print_log("Run with valgrind massif")
-        massif_out = "massif.out"
-        # Use 0.01% as the valgrind massif threshold.
-        # A smaller value reports more details, but it may take a longer time
-        # to analyze the log. Change it accordingly.
- mem_threshold = 0.01 - cmd = ( - "valgrind --tool=massif --threshold={} --massif-out-file=".format( - mem_threshold - ) - + massif_out - + " " - ) - cmd = cmd + script_path + " --warm-up=no --run-iter=20" - run_cmd(cmd) - ms_print_file = "massif.out.ms_print" - cmd = ( - "ms_print --threshold={} ".format(mem_threshold) - + massif_out - + " > " - + ms_print_file - ) - os.system(cmd) - cmd = "head -n 33 " + ms_print_file - os.system(cmd) - print_log("Read {} for detailed massif output".format(ms_print_file)) - - if args.run_parallel: - print_log("**************************************") - tmp_out = "/dev/null" - # Change server and port to run at your system - server = "localhost" - port = "2222" - for num_gpu in (2, 4, 8): - print_log("Run with {} GPUs".format(num_gpu)) - - cmd = script_path + " --num-gpu={} --server={} --port={} ".format( - num_gpu, server, port - ) - for i in range(num_gpu - 1): - irank = num_gpu - 1 - i - os.system( - cmd - + " --device={}".format(irank) - + " 1>{} 2>{} &".format(tmp_out, tmp_out) - ) - if not run_cmd(cmd): - break - - print_log("**************************************") - print_log("**************************************") - print("Finish run, summary:") - cmd = 'grep "Run with\|Wall time\|Run status\|Error\|GPU Usage" ' + log_path - os.system(cmd) diff --git a/python_module/test/integration/mnist_model_with_test.mge b/python_module/test/integration/mnist_model_with_test.mge deleted file mode 100644 index 126837d4..00000000 Binary files a/python_module/test/integration/mnist_model_with_test.mge and /dev/null differ diff --git a/python_module/test/integration/mnist_model_with_test_cpu.mge b/python_module/test/integration/mnist_model_with_test_cpu.mge deleted file mode 100644 index b0e8ad5c..00000000 Binary files a/python_module/test/integration/mnist_model_with_test_cpu.mge and /dev/null differ diff --git a/python_module/test/integration/test_converge.py b/python_module/test/integration/test_converge.py deleted file mode 100644 index e7e4f6c4..00000000 --- a/python_module/test/integration/test_converge.py +++ /dev/null @@ -1,114 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import itertools - -import numpy as np -import pytest - -import megengine as mge -from megengine.core import tensor -from megengine.functional import cross_entropy_with_softmax, tanh -from megengine.jit import trace -from megengine.module import Linear, Module -from megengine.optimizer import SGD - -batch_size = 64 -data_shape = (batch_size, 2) -label_shape = (batch_size,) - - -def minibatch_generator(): - while True: - inp_data = np.zeros((batch_size, 2)) - label = np.zeros(batch_size, dtype=np.int32) - for i in range(batch_size): - # [x0, x1], sampled from U[-1, 1] - inp_data[i, :] = np.random.rand(2) * 2 - 1 - label[i] = 0 if np.prod(inp_data[i]) < 0 else 1 - yield inp_data.astype(np.float32), label.astype(np.int32) - - -def calculate_precision(data: np.ndarray, pred: np.ndarray) -> float: - """ Calculate precision for given data and prediction. - - :type data: [[x, y], ...] - :param data: Input data - :type pred: [[x_pred, y_pred], ...] 
- :param pred: Network output data - """ - correct = 0 - assert len(data) == len(pred) - for inp_data, pred_output in zip(data, pred): - label = 0 if np.prod(inp_data) < 0 else 1 - pred_label = np.argmax(pred_output) - if pred_label == label: - correct += 1 - return float(correct) / len(data) - - -class XORNet(Module): - def __init__(self): - self.mid_layers = 14 - self.num_class = 2 - super().__init__() - - self.fc0 = Linear(self.num_class, self.mid_layers, bias=True) - self.fc1 = Linear(self.mid_layers, self.mid_layers, bias=True) - - self.fc2 = Linear(self.mid_layers, self.num_class, bias=True) - - def forward(self, x): - x = self.fc0(x) - x = tanh(x) - x = self.fc1(x) - x = tanh(x) - x = self.fc2(x) - return x - - -@pytest.mark.slow -def test_training_converge(): - net = XORNet() - opt = SGD( - net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 - ) - - @trace - def train(data, label): - pred = net(data) - opt.zero_grad() - loss = cross_entropy_with_softmax(pred, label) - opt.backward(loss) - return loss - - @trace - def infer(data): - return net(data) - - train_dataset = minibatch_generator() - losses = [] - - for data, label in itertools.islice(train_dataset, 2000): - # opt.zero_grad() - loss = train(data, label) - loss = loss[0][0] - opt.step() - losses.append(loss.numpy()) - - assert np.mean(losses[-100:]) < 0.1, "Final training Loss must be low enough" - - ngrid = 10 - x = np.linspace(-1.0, 1.0, ngrid) - xx, yy = np.meshgrid(x, x) - xx = xx.reshape((ngrid * ngrid, 1)) - yy = yy.reshape((ngrid * ngrid, 1)) - data = np.concatenate((xx, yy), axis=1).astype(np.float32) - - pred = infer(data).numpy() - assert calculate_precision(data, pred) == 1.0, "Test precision must be high enough" diff --git a/python_module/test/integration/test_correctness.py b/python_module/test/integration/test_correctness.py deleted file mode 100644 index 70a9861f..00000000 --- a/python_module/test/integration/test_correctness.py +++ /dev/null @@ -1,199 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import os -import re -import subprocess -import sys - -import numpy as np - -import megengine as mge -import megengine.functional as F -from megengine import jit, tensor -from megengine.functional.debug_param import set_conv_execution_strategy -from megengine.jit import SublinearMemoryConfig -from megengine.module import AvgPool2d, BatchNorm2d, Conv2d, Linear, Module -from megengine.optimizer import SGD -from megengine.test import assertTensorClose - - -def get_gpu_name(): - try: - gpu_info = subprocess.check_output( - ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"] - ) - gpu_info = gpu_info.decode("ascii").split("\n")[0] - except: - gpu_info = "None" - return gpu_info - - -def get_cpu_name(): - cpu_info = "None" - try: - cpu_info = subprocess.check_output(["cat", "/proc/cpuinfo"]).decode("ascii") - for line in cpu_info.split("\n"): - if "model name" in line: - return re.sub(".*model name.*:", "", line, 1).strip() - except: - pass - return cpu_info - - -def get_xpu_name(): - if mge.is_cuda_available(): - return get_gpu_name() - else: - return get_cpu_name() - - -class MnistNet(Module): - def __init__(self, has_bn=False): - super().__init__() - self.conv0 = Conv2d(1, 20, kernel_size=5, bias=True) - self.pool0 = AvgPool2d(2) - self.conv1 = Conv2d(20, 20, kernel_size=5, bias=True) - self.pool1 = AvgPool2d(2) - self.fc0 = Linear(20 * 4 * 4, 500, bias=True) - self.fc1 = Linear(500, 10, bias=True) - self.bn0 = None - self.bn1 = None - if has_bn: - self.bn0 = BatchNorm2d(20) - self.bn1 = BatchNorm2d(20) - - def forward(self, x): - x = self.conv0(x) - if self.bn0: - x = self.bn0(x) - x = F.relu(x) - x = self.pool0(x) - x = self.conv1(x) - if self.bn1: - x = self.bn1(x) - x = F.relu(x) - x = self.pool1(x) - x = F.flatten(x, 1) - x = self.fc0(x) - x = F.relu(x) - x = self.fc1(x) - return x - - -def train(data, label, net, opt): - - pred = net(data) - loss = F.cross_entropy_with_softmax(pred, label) - opt.backward(loss) - return loss - - -def update_model(model_path): - """ - Update the dumped model with test cases for new reference values. - - The model with pre-trained weights is trained for one iter with the test data attached. - The loss and updated net state dict is dumped. - - .. code-block:: python - - from test_correctness import update_model - update_model('mnist_model_with_test.mge') # for gpu - update_model('mnist_model_with_test_cpu.mge') # for cpu - - """ - net = MnistNet(has_bn=True) - checkpoint = mge.load(model_path) - net.load_state_dict(checkpoint["net_init"]) - lr = checkpoint["sgd_lr"] - opt = SGD(net.parameters(), lr=lr) - - data = tensor(dtype=np.float32) - label = tensor(dtype=np.int32) - data.set_value(checkpoint["data"]) - label.set_value(checkpoint["label"]) - - opt.zero_grad() - loss = train(data, label, net=net, opt=opt) - opt.step() - - xpu_name = get_xpu_name() - - checkpoint.update( - {"net_updated": net.state_dict(), "loss": loss.numpy(), "xpu": xpu_name} - ) - mge.save(checkpoint, model_path) - - -def run_test( - model_path, use_jit, use_symbolic, sublinear_memory_config=None, max_err=None, -): - - """ - Load the model with test cases and run the training for one iter. - The loss and updated weights are compared with reference value to verify the correctness. - - Dump a new file with updated result by calling update_model - if you think the test fails due to numerical rounding errors instead of bugs. - Please think twice before you do so. 
- - """ - net = MnistNet(has_bn=True) - checkpoint = mge.load(model_path) - net.load_state_dict(checkpoint["net_init"]) - lr = checkpoint["sgd_lr"] - opt = SGD(net.parameters(), lr=lr) - - data = tensor(dtype=np.float32) - label = tensor(dtype=np.int32) - data.set_value(checkpoint["data"]) - label.set_value(checkpoint["label"]) - - if max_err is None: - max_err = 1e-5 - - train_func = train - if use_jit: - train_func = jit.trace( - train_func, - symbolic=use_symbolic, - sublinear_memory_config=sublinear_memory_config, - ) - - opt.zero_grad() - loss = train_func(data, label, net=net, opt=opt) - opt.step() - - assertTensorClose(loss.numpy(), checkpoint["loss"], max_err=max_err) - - for param, param_ref in zip( - net.state_dict().items(), checkpoint["net_updated"].items() - ): - assert param[0] == param_ref[0] - assertTensorClose(param[1], param_ref[1], max_err=max_err) - - -def test_correctness(): - - if mge.is_cuda_available(): - model_name = "mnist_model_with_test.mge" - else: - model_name = "mnist_model_with_test_cpu.mge" - model_path = os.path.join(os.path.dirname(__file__), model_name) - set_conv_execution_strategy("HEURISTIC_REPRODUCIBLE") - - run_test(model_path, False, False) - run_test(model_path, True, False) - run_test(model_path, True, True) - - # sublinear - config = SublinearMemoryConfig(genetic_nr_iter=10) - run_test( - model_path, True, True, sublinear_memory_config=config, max_err=1e-5, - ) diff --git a/python_module/test/integration/test_distributed.py b/python_module/test/integration/test_distributed.py deleted file mode 100644 index ba2e3080..00000000 --- a/python_module/test/integration/test_distributed.py +++ /dev/null @@ -1,97 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import multiprocessing as mp -import platform -import subprocess -import sys - -import numpy as np -import pytest - - -def worker(master_ip, master_port, world_size, rank, dev, trace): - import megengine.distributed as dist - import megengine.functional as F - from megengine import is_cuda_available - from megengine import jit - from megengine.module import Linear, Module - from megengine.optimizer import SGD - - if not is_cuda_available(): - return - - class MLP(Module): - def __init__(self): - super().__init__() - self.fc0 = Linear(3 * 224 * 224, 500) - self.fc1 = Linear(500, 10) - - def forward(self, x): - x = self.fc0(x) - x = F.relu(x) - x = self.fc1(x) - return x - - dist.init_process_group( - master_ip=master_ip, master_port=3456, world_size=world_size, rank=rank, dev=dev - ) - net = MLP() - - opt = SGD(net.parameters(requires_grad=True), lr=0.02) - - data = np.random.random((64, 3 * 224 * 224)).astype(np.float32) - label = np.random.randint(0, 10, size=(64,)).astype(np.int32) - - jit.trace.enabled = trace - - @jit.trace() - def train_func(data, label): - pred = net(data) - loss = F.cross_entropy_with_softmax(pred, label) - opt.backward(loss) - return loss - - for i in range(5): - opt.zero_grad() - loss = train_func(data, label) - opt.step() - - -def start_workers(worker, world_size, trace=False): - def run_subproc(rank): - cmd = "from test.integration.test_distributed import worker\n" - cmd += "worker('localhost', 3456, {}, {}, {}, {})".format( - world_size, rank, rank, "True" if trace else "False" - ) - cmd = [sys.executable, "-c", cmd] - ret = subprocess.run( - cmd, stdout=sys.stdout, stderr=sys.stderr, universal_newlines=True - ) - assert ret.returncode == 0, "subprocess failed" - - procs = [] - for rank in range(world_size): - p = mp.Process(target=run_subproc, args=(rank,)) - p.start() - procs.append(p) - - for p in procs: - p.join() - assert p.exitcode == 0 - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -def test_distributed(): - start_workers(worker, 2, trace=True) - start_workers(worker, 2, trace=False) diff --git a/python_module/test/integration/test_equivalence.py b/python_module/test/integration/test_equivalence.py deleted file mode 100644 index e215450f..00000000 --- a/python_module/test/integration/test_equivalence.py +++ /dev/null @@ -1,184 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import copy -import itertools -import os -from typing import Callable - -import numpy as np -import pytest - -import megengine as mge -import megengine.module.init as init -from megengine.core import tensor -from megengine.functional import cross_entropy_with_softmax, relu -from megengine.jit import trace -from megengine.module import Linear, Module -from megengine.optimizer import SGD, Optimizer -from megengine.test import assertTensorClose - -batch_size = 64 -data_shape = (batch_size, 2) -label_shape = (batch_size,) - - -def minibatch_generator(): - while True: - inp_data = np.zeros((batch_size, 2)) - label = np.zeros(batch_size, dtype=np.int32) - - for i in range(batch_size): - # [x0, x1], sampled from U[-1, 1] - inp_data[i, :] = np.random.rand(2) * 2 - 1 - label[i] = 0 if np.prod(inp_data[i]) < 0 else 1 - - yield inp_data.astype(np.float32), label.astype(np.int32) - - -class SimpleNet(Module): - def __init__(self): - self.mid_layers = 14 - self.num_class = 2 - super().__init__() - - self.fc0 = Linear(self.num_class, self.mid_layers, bias=True) - fan_in, _ = init.calculate_fan_in_and_fan_out(self.fc0.weight) - init.normal_(self.fc0.weight, std=np.sqrt(float(1.0) / fan_in)) - init.zeros_(self.fc0.bias) - - self.fc1 = Linear(self.mid_layers, self.mid_layers, bias=True) - fan_in, _ = init.calculate_fan_in_and_fan_out(self.fc1.weight) - init.normal_(self.fc1.weight, std=np.sqrt(float(1.0) / fan_in)) - init.zeros_(self.fc1.bias) - - self.fc2 = Linear(self.mid_layers, self.num_class, bias=True) - fan_in, _ = init.calculate_fan_in_and_fan_out(self.fc2.weight) - init.normal_(self.fc2.weight, std=np.sqrt(float(1.0) / fan_in)) - init.zeros_(self.fc2.bias) - - def forward(self, x): - x = self.fc0(x) - x = relu(x) # Should use tanh but it's not stable now. - x = self.fc1(x) - x = relu(x) # Should use tanh but it's not stable now. - x = self.fc2(x) - return x - - -def generate_eager_step(net: Module, opt_factory: Callable[[Module], Optimizer]): - data_inp = tensor(np.zeros(data_shape), dtype=np.float32) - label_inp = tensor(np.zeros(label_shape), dtype=np.int32) - opt = opt_factory(net) - - def step(data, label): - opt.zero_grad() - data_inp.set_value(data) - label_inp.set_value(label) - - pred = net(data_inp) - loss = cross_entropy_with_softmax(pred, label_inp) - opt.backward(loss) - opt.step() - - return loss.numpy()[0] - - return step - - -def generate_static_step(net: Module, opt_factory: Callable[[Module], Optimizer]): - data = tensor(np.zeros(data_shape), dtype=np.float32) - label = tensor(np.zeros(label_shape), dtype=np.int32) - opt = opt_factory(net) - - # Save state to reset parameters later. - state = copy.deepcopy(net.state_dict()) - - # Evaluate network in eager mode once. - pred = net(data) - loss = cross_entropy_with_softmax(pred, label) - opt.zero_grad() - grads = opt.backward(loss) - - f = mge.graph.compile(loss, grads) - - def step(data, label): - opt.zero_grad() - out = f(data=data, label=label) - opt.step() - loss = out[0][0] - return loss - - # Reset parameters. 
-    net.load_state_dict(state)
-    return step
-
-
-def generate_trace_step(
-    net: Module, opt_factory: Callable[[Module], Optimizer], enable: bool
-):
-    opt = opt_factory(net)
-
-    @trace
-    def train(data, label):
-        pred = net(data)
-        loss = cross_entropy_with_softmax(pred, label)
-        opt.zero_grad()
-        opt.backward(loss)
-        return loss
-
-    train.enabled = enable
-
-    def step(data, label):
-        out = train(data, label)
-        opt.step()
-        loss = out[0][0]
-        return loss
-
-    return step
-
-
-def assert_network_equivalence(nets):
-    net_state = [net.state_dict() for net in nets]
-
-    for state in net_state[1:]:
-        assert len(net_state[0]) == len(state)
-
-    for k, v in net_state[0].items():
-        for state in net_state[1:]:
-            assert k in state
-            assertTensorClose(v, state[k])
-
-
-@pytest.mark.slow
-def test_eager_equivalence():
-    eager_net = SimpleNet()
-    trace_enable_net = copy.deepcopy(eager_net)
-    trace_disable_net = copy.deepcopy(eager_net)
-
-    opt_factory = lambda net: SGD(
-        net.parameters(requires_grad=True), lr=0.01, momentum=0.01
-    )
-
-    estep = generate_eager_step(eager_net, opt_factory)
-    te_step = generate_trace_step(trace_enable_net, opt_factory, True)
-    td_step = generate_trace_step(trace_disable_net, opt_factory, False)
-
-    assert_network_equivalence([eager_net, trace_enable_net, trace_disable_net])
-
-    # Use a hard-coded number as the limit; increase it if needed.
-    for data, label in itertools.islice(minibatch_generator(), 200):
-        eloss = estep(data, label)
-        te_loss = te_step(data, label)
-        td_loss = td_step(data, label)
-
-        assertTensorClose(eloss, te_loss)
-        assertTensorClose(eloss, td_loss)
-        assert_network_equivalence(
-            [eager_net, trace_enable_net, trace_disable_net,]
-        )
diff --git a/python_module/test/integration/test_fastrun.py b/python_module/test/integration/test_fastrun.py
deleted file mode 100644
index 8cce8c59..00000000
--- a/python_module/test/integration/test_fastrun.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import numpy as np
-
-import megengine as mge
-from megengine.functional.debug_param import set_conv_execution_strategy
-from megengine.module.conv import Conv2d
-
-
-def test_fastrun():
-    set_conv_execution_strategy("PROFILE")
-    x = Conv2d(1, 1, kernel_size=1, bias=True)
-    a = mge.tensor(np.random.randn(1, 1, 1, 1).astype(np.float32))
-    a = x(a)
diff --git a/python_module/test/integration/test_parampack.py b/python_module/test/integration/test_parampack.py
deleted file mode 100644
index c9acc47c..00000000
--- a/python_module/test/integration/test_parampack.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import itertools - -import numpy as np -import pytest - -import megengine as mge -from megengine.core import tensor -from megengine.functional import cross_entropy_with_softmax, tanh -from megengine.jit import trace -from megengine.module import Linear, Module, ParamPack -from megengine.optimizer import SGD - -batch_size = 64 -data_shape = (batch_size, 2) -label_shape = (batch_size,) - - -def minibatch_generator(): - while True: - inp_data = np.zeros((batch_size, 2)) - label = np.zeros(batch_size, dtype=np.int32) - for i in range(batch_size): - # [x0, x1], sampled from U[-1, 1] - inp_data[i, :] = np.random.rand(2) * 2 - 1 - label[i] = 0 if np.prod(inp_data[i]) < 0 else 1 - yield inp_data.astype(np.float32), label.astype(np.int32) - - -def calculate_precision(data: np.ndarray, pred: np.ndarray) -> float: - """ Calculate precision for given data and prediction. - - :type data: [[x, y], ...] - :param data: Input data - :type pred: [[x_pred, y_pred], ...] - :param pred: Network output data - """ - correct = 0 - assert len(data) == len(pred) - for inp_data, pred_output in zip(data, pred): - label = 0 if np.prod(inp_data) < 0 else 1 - pred_label = np.argmax(pred_output) - if pred_label == label: - correct += 1 - return float(correct) / len(data) - - -class XORNet(Module): - def __init__(self): - self.mid_layers = 14 - self.num_class = 2 - super().__init__() - - self.fc0 = Linear(self.num_class, self.mid_layers, bias=True) - self.fc1 = Linear(self.mid_layers, self.mid_layers, bias=True) - - self.fc2 = Linear(self.mid_layers, self.num_class, bias=True) - - def forward(self, x): - x = self.fc0(x) - x = tanh(x) - x = self.fc1(x) - x = tanh(x) - x = self.fc2(x) - return x - - -@pytest.mark.slow -def test_static_graph_parampack(): - net = XORNet() - net = ParamPack( - net, nr_ignore_first=0, max_size_per_group=10, max_nr_params_per_group=100 - ) - opt = SGD( - net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 - ) - - @trace(symbolic=True) - def train(data, label): - pred = net(data) - opt.zero_grad() - loss = cross_entropy_with_softmax(pred, label) - opt.backward(loss) - return loss - - @trace(symbolic=True) - def infer(data): - return net(data) - - train_dataset = minibatch_generator() - losses = [] - - for data, label in itertools.islice(train_dataset, 2000): - loss = train(data, label) - loss = loss[0][0] - opt.step() - losses.append(loss.numpy()) - - assert np.mean(losses[-100:]) < 0.1, "Final training Loss must be low enough" - - ngrid = 10 - x = np.linspace(-1.0, 1.0, ngrid) - xx, yy = np.meshgrid(x, x) - xx = xx.reshape((ngrid * ngrid, 1)) - yy = yy.reshape((ngrid * ngrid, 1)) - data = np.concatenate((xx, yy), axis=1).astype(np.float32) - - pred = infer(data).numpy() - assert calculate_precision(data, pred) == 1.0, "Test precision must be high enough" - - -@pytest.mark.slow -def test_nopack_parampack(): - net = XORNet() - net = ParamPack(net, max_size_per_group=0, max_nr_params_per_group=0) - opt = SGD( - net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 - ) - - @trace(symbolic=True) - def train(data, label): - pred = net(data) - opt.zero_grad() - loss = cross_entropy_with_softmax(pred, label) - opt.backward(loss) - return loss - - @trace(symbolic=True) - def infer(data): - return net(data) - - train_dataset = minibatch_generator() - losses = [] - - for data, label in itertools.islice(train_dataset, 2000): - loss = train(data, label) - loss = loss[0][0] - opt.step() - losses.append(loss.numpy()) - assert np.mean(losses[-100:]) < 0.1, "Final 
training Loss must be low enough" - - ngrid = 10 - x = np.linspace(-1.0, 1.0, ngrid) - xx, yy = np.meshgrid(x, x) - xx = xx.reshape((ngrid * ngrid, 1)) - yy = yy.reshape((ngrid * ngrid, 1)) - data = np.concatenate((xx, yy), axis=1).astype(np.float32) - - pred = infer(data).numpy() - assert calculate_precision(data, pred) == 1.0, "Test precision must be high enough" - - -@pytest.mark.slow -def test_dynamic_graph_parampack(): - net = XORNet() - net = ParamPack( - net, nr_ignore_first=0, max_size_per_group=10, max_nr_params_per_group=100 - ) - opt = SGD( - net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 - ) - - @trace(symbolic=False) - def train(data, label): - pred = net(data) - opt.zero_grad() - loss = cross_entropy_with_softmax(pred, label) - opt.backward(loss) - return loss - - @trace(symbolic=False) - def infer(data): - return net(data) - - train_dataset = minibatch_generator() - losses = [] - - for data, label in itertools.islice(train_dataset, 2000): - loss = train(data, label) - loss = loss[0][0] - opt.step() - losses.append(loss.numpy()) - - assert np.mean(losses[-100:]) < 0.1, "Final training Loss must be low enough" - - ngrid = 10 - x = np.linspace(-1.0, 1.0, ngrid) - xx, yy = np.meshgrid(x, x) - xx = xx.reshape((ngrid * ngrid, 1)) - yy = yy.reshape((ngrid * ngrid, 1)) - data = np.concatenate((xx, yy), axis=1).astype(np.float32) - - pred = infer(data).numpy() - assert calculate_precision(data, pred) == 1.0, "Test precision must be high enough" - - -@pytest.mark.slow -def test_correctness_parampack(): - net1 = XORNet() - net2 = XORNet() - params1 = net1.parameters() - params2 = net2.parameters() - for param1, param2 in zip(params1, params2): - param1.set_value(param2.numpy()) - net1 = ParamPack( - net1, nr_ignore_first=0, max_size_per_group=10, max_nr_params_per_group=100 - ) - opt1 = SGD( - net1.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 - ) - - opt2 = SGD( - net2.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 - ) - - @trace(symbolic=False) - def train1(data, label): - pred = net1(data) - opt1.zero_grad() - loss = cross_entropy_with_softmax(pred, label) - opt1.backward(loss) - return loss - - @trace(symbolic=False) - def train2(data, label): - pred = net2(data) - opt2.zero_grad() - loss = cross_entropy_with_softmax(pred, label) - opt2.backward(loss) - return loss - - @trace(symbolic=False) - def infer1(data): - return net1(data) - - @trace(symbolic=False) - def infer2(data): - return net2(data) - - train_dataset = minibatch_generator() - - for data, label in itertools.islice(train_dataset, 2000): - train1(data, label) - opt1.step() - - train2(data, label) - opt2.step() - - data, _ = next(train_dataset) - pred1 = infer1(data).numpy() - pred2 = infer2(data).numpy() - assert np.allclose(pred1, pred2) - - -def test_parampack_group_func(): - net = XORNet() - net = ParamPack( - net, - nr_ignore_first=1, - max_size_per_group=10, - max_nr_params_per_group=100, - group_func=lambda n, p: "weight" in n, - ) - for p in net.parameters(requires_grad=True): - assert p.pack_group_key is not None - for n, p in net.named_parameters(requires_grad=True): - assert p.pack_group_key is not None diff --git a/python_module/test/regression/.gitignore b/python_module/test/regression/.gitignore deleted file mode 100644 index 328c8678..00000000 --- a/python_module/test/regression/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -data -log diff --git a/python_module/test/regression/__init__.py 
b/python_module/test/regression/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python_module/test/regression/test_MGE-103.py b/python_module/test/regression/test_MGE-103.py deleted file mode 100644 index e0d387c1..00000000 --- a/python_module/test/regression/test_MGE-103.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np -import pytest - -import megengine as mge -from megengine.core import tensor -from megengine.jit import trace -from megengine.module import BatchNorm2d -from megengine.test import assertTensorClose - - -@pytest.mark.regression -def test_batchnorm_change_batchsize(): - data_shape = (2, 3, 8, 8) - real_shape = (4, 3, 8, 8) - data = np.random.random(data_shape).astype(np.float32) - d = np.random.random(real_shape).astype(np.float32) - - bn = BatchNorm2d(3) - f = trace(bn) - f(data) - - y1 = f(d) - - y0 = bn(tensor(d)) - - assertTensorClose(y0.numpy(), y1.numpy()) diff --git a/python_module/test/regression/test_MGE-22.py b/python_module/test/regression/test_MGE-22.py deleted file mode 100644 index b719b9be..00000000 --- a/python_module/test/regression/test_MGE-22.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np -import pytest - -from megengine.core import tensor -from megengine.module import Linear, Module -from megengine.optimizer import SGD - - -class Blur(Module): - def __init__(self, dim1=16, dim2=128, dim3=1): - super().__init__() - - self.fc1 = Linear(dim1, dim2) - self.fc2 = Linear(dim2, dim3) - - def forward(self, x): - x = self.fc1(x) - x = self.fc2(x) - return x.mean(axis=1, keepdims=True) - - -@pytest.mark.regression -def test_blur(): - net = Blur() - data = tensor(np.random.random((32, 16)).astype("float32")) - - opt = SGD(net.parameters(requires_grad=True), lr=0.1) - opt.zero_grad() - - loss = net(data) - opt.backward(loss.sum()) diff --git a/python_module/test/regression/test_MGE-323.py b/python_module/test/regression/test_MGE-323.py deleted file mode 100644 index 47e73f2a..00000000 --- a/python_module/test/regression/test_MGE-323.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
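# ---- editor's note (illustrative sketch, not part of this diff) ----
# test_MGE-103 above guards a specific regression: a trace compiled for one
# batch size must still produce the right answer when re-run with another.
# Its essence, using only calls from the removed file (legacy API assumed):
import numpy as np
from megengine.core import tensor
from megengine.jit import trace
from megengine.module import BatchNorm2d
from megengine.test import assertTensorClose

bn = BatchNorm2d(3)
f = trace(bn)
f(np.random.random((2, 3, 8, 8)).astype(np.float32))   # first call fixes the graph
d = np.random.random((4, 3, 8, 8)).astype(np.float32)  # different batch size
y_traced = f(d)
y_eager = bn(tensor(d))
assertTensorClose(y_eager.numpy(), y_traced.numpy())    # traced == eager
# ---- end editor's note ----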
-import numpy as np - -import megengine as mge - - -def test_mge_323(): - # Regression: set_value does not update eager_val - x = mge.tensor([0]) - _ = x * 2 - x.set_value([1, 1]) - np.testing.assert_array_equal(x.numpy(), [1, 1]) - assert x.shape == (2,) - np.testing.assert_array_equal(x * 2, [2, 2]) diff --git a/python_module/test/regression/test_MGE-81.py b/python_module/test/regression/test_MGE-81.py deleted file mode 100644 index f32b3232..00000000 --- a/python_module/test/regression/test_MGE-81.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - -import megengine as mge -import megengine.functional as F -import megengine.module as M -from megengine.core import tensor - - -def test_mge_81(): - np.random.seed(0) - N, D = 3, 4 - x = mge.Parameter(value=np.random.normal(size=(N, D)).astype(np.float32)) - y = mge.Parameter(value=np.random.normal(size=(N, D)).astype(np.float32)) - z = mge.Parameter(value=np.random.normal(size=(N, D)).astype(np.float32)) - a = x * y - b = a + z - c = F.sum(b) - grad_x = F.grad(c, x, use_virtual_grad=False) - grad_y = F.grad(c, y, use_virtual_grad=False) - grad_z = F.grad(c, z, use_virtual_grad=False) - print(grad_x.numpy()) - print(grad_y.numpy()) - print(grad_z.numpy()) - m = M.BatchNorm2d(4) - input = tensor(np.zeros((64, 4, 32, 32), dtype=np.float32)) - _ = m(input) - m = M.BatchNorm2d(4, affine=False) - _ = m(input) diff --git a/python_module/test/run.sh b/python_module/test/run.sh deleted file mode 100755 index 7f2d9038..00000000 --- a/python_module/test/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -e - -ignore_list="--ignore test/unit/module/test_pytorch.py \ - --ignore test/pytorch_comparison \ - --ignore test/unit/hub/test_hub.py \ - --ignore test/unit/data \ - --ignore test/integration/manual \ - --ignore megengine/module/pytorch \ - --ignore test/unit/module/test_external.py" -test_dirs="megengine test" - -pushd $(dirname "${BASH_SOURCE[0]}")/.. >/dev/null - python3 -m pytest -xv -m 'isolated_distributed' \ - --json-report --json-report-file=time_python_test.json \ - $ignore_list $test_dirs - python3 -m pytest -xv -m 'not internet and not isolated_distributed' \ - --json-report --json-report-file=time_python_test.json \ - $ignore_list $test_dirs -popd >/dev/null diff --git a/python_module/test/unit/__init__.py b/python_module/test/unit/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python_module/test/unit/core/__init__.py b/python_module/test/unit/core/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python_module/test/unit/core/test_dynamic_profiling.py b/python_module/test/unit/core/test_dynamic_profiling.py deleted file mode 100644 index 53e603b8..00000000 --- a/python_module/test/unit/core/test_dynamic_profiling.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import json - -import numpy as np - -import megengine.functional as F -from megengine import graph, tensor - - -def test_dynamic_profiling(): - with graph.Graph(): - sz = 16 - - cg = graph.get_default_graph() - - x = tensor(np.arange(0, sz, dtype=np.float32)) - y = F.relu(x) - - str1 = cg.get_mem_allocation_info() - if str1 == "": - return - json_str1 = json.loads(str1) - - z = F.add_update(x, y) - - json_str2 = json.loads(cg.get_mem_allocation_info()) - - diff = lambda l1, l2: [x for x in l1 if x not in l2] - - jdiff = diff(json_str2, json_str1) - assert ( - len(jdiff) == 1 - ), "add_update operator should produce only one opr internally" - - dest_key = list(jdiff[0].keys())[0] - assert ( - jdiff[0][dest_key]["output"][0]["memory"] == sz * 4 - ), "output of add_update operator has wrong allocated size" - - # check add_update is inplace or not - dest_ptr = jdiff[0][dest_key]["output"][0]["dev_ptr"] - - found = False - for li in json_str1: - if "0" in li.keys(): - src_ptr = li["0"]["output"][0]["dev_ptr"] - found = dest_ptr == src_ptr - - assert found == True, "add_update is not inplace" diff --git a/python_module/test/unit/core/test_function.py b/python_module/test/unit/core/test_function.py deleted file mode 100644 index c58978bb..00000000 --- a/python_module/test/unit/core/test_function.py +++ /dev/null @@ -1,169 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
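# ---- editor's note (illustrative sketch, not part of this diff) ----
# test_function.py, whose removal starts just above, exercises
# megengine.core.Function: a user-defined op supplying both forward and
# backward. A minimal instance of the same pattern, built only from calls
# that appear in that file (legacy API assumed):
import numpy as np
import megengine.functional as F
from megengine.core import Function, tensor
from megengine.test import assertTensorClose

class Square(Function):
    def forward(self, x):
        self.save_for_backward(x)    # stash what backward will need
        return x * x

    def backward(self, grad_y):
        (x,) = self.saved_tensors
        return grad_y * 2 * x        # d(x^2)/dx = 2x

x = tensor(np.array([3.0], dtype=np.float32))
y = Square()(x).sum()
assertTensorClose(F.grad(y, x, use_virtual_grad=False).numpy(),
                  np.array([6.0], dtype=np.float32))
# ---- end editor's note ----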
-import copy - -import numpy as np - -import megengine.functional as F -from megengine.core import Function, tensor -from megengine.jit import trace -from megengine.test import assertTensorClose - - -def test_a_plus_b(): - data_shape = (1, 9, 2, 6) - av = np.random.random(data_shape).astype(np.float32) - bv = np.random.random(data_shape).astype(np.float32) - a = tensor(av) - b = tensor(bv) - - class MulFunc(Function): - def forward(self, a, b): - return a * b - - def backward(self, grad_o): - return (grad_o * b * 2, grad_o * a * 3) - - c = MulFunc()(a, b).sum() - assertTensorClose(c.numpy(), (av * bv).sum()) - assertTensorClose(F.grad(c, a, use_virtual_grad=False).numpy(), bv * 2) - assertTensorClose(F.grad(c, b, use_virtual_grad=False).numpy(), av * 3) - - -def test_skip_invalid_grad(): - data_shape = (1, 9, 2, 6) - av = np.random.random(data_shape).astype(np.float32) - bv = np.random.random(data_shape).astype(np.float32) - a = tensor(av) - b = tensor(bv) - cookie = tensor(np.random.random(data_shape).astype(np.float32)) - - class EqWithFakeGrad(Function): - def forward(self, a, b): - return a == b - - def backward(self, grad_o): - _ = grad_o - return cookie, cookie - - c = EqWithFakeGrad()(a, b).sum() - assertTensorClose(c.numpy(), (av == bv).sum().astype(np.float32)) - assertTensorClose(F.grad(c, a, use_virtual_grad=False).numpy(), cookie) - assertTensorClose(F.grad(c, b, use_virtual_grad=False).numpy(), cookie) - - -def test_ste(): - class STE(Function): - def forward(self, x): - maxv, minv = x.max(), x.min() - scale = F.maximum(maxv, -minv) / 127 - return F.round(x / scale) * scale - - def backward(self, grad_y): - return grad_y - - data_shape = (1, 9, 2, 6) - av = np.random.random(data_shape).astype(np.float32) - a = tensor(av) - q = STE()(a) - q_2 = (q * 2.0).sum() - assertTensorClose( - F.grad(q_2, a, use_virtual_grad=False).numpy(), - np.broadcast_to(np.array([2.0], dtype=np.float32), data_shape), - ) - - -def test_deepcopy(): - class Sigmoid(Function): - def __init__(self, param): - super().__init__() - self.param = param - - def forward(self, x): - y = 1 / (1 + F.exp(-x)) - self.save_for_backward(y) - return y - - def backward(self, grad_y): - (y,) = self.saved_tensors - return grad_y * y * (1 - y) - - origin = Sigmoid(0) - new = copy.deepcopy(Sigmoid(0)) - assert new.param == origin.param - - -def test_save_context(): - class Sigmoid(Function): - def forward(self, x): - y = 1 / (1 + F.exp(-x)) - self.save_for_backward(y) - return y - - def backward(self, grad_y): - (y,) = self.saved_tensors - return grad_y * y * (1 - y) - - def run_saved_context(a, net=None): - return net(a) - - def run(use_trace, symbolic): - a = tensor(np.array([1926.0817], dtype=np.float32)) - net = Sigmoid() - func_run = run_saved_context - if use_trace: - func_run = trace(run_saved_context, symbolic=symbolic) - s = func_run(a, net=net) - s2 = F.sigmoid(a) - assertTensorClose(s.numpy(), s2.numpy()) - assertTensorClose( - F.grad(s, a, use_virtual_grad=False).numpy(), - F.grad(s2, a, use_virtual_grad=False).numpy(), - ) - - run(False, False) - run(True, False) - run(True, True) - - -def test_none_in_out_grad(): - class Test(Function): - def forward(self, a, b): - return a, b - - def backward(self, grad_a, grad_b): - assert grad_b is None - return (grad_a, 0) - - a = tensor(np.array([1.0], dtype=np.float32)) - b = tensor(np.array([2.0], dtype=np.float32)) - aa, bb = Test()(a, b) - assertTensorClose( - F.grad(aa, a, use_virtual_grad=False).numpy(), np.array([1.0], dtype=np.float32) - ) - assertTensorClose( - 
F.grad(aa, b, use_virtual_grad=False).numpy(), np.array([0.0], dtype=np.float32) - ) - - -def test_zero_grad(): - class StopGradient(Function): - def forward(self, a): - return a - - def backward(self, *_): - return None - - a = tensor(np.array([1.0], dtype=np.float32)) - b = a * 3.0 - c = a * 4.0 - loss = StopGradient()(b) + c - assertTensorClose( - F.grad(loss, a, use_virtual_grad=False).numpy(), - np.array([4.0], dtype=np.float32), - ) diff --git a/python_module/test/unit/core/test_graph.py b/python_module/test/unit/core/test_graph.py deleted file mode 100644 index 5f5fbe95..00000000 --- a/python_module/test/unit/core/test_graph.py +++ /dev/null @@ -1,74 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np -import pytest -from helpers import MLP - -import megengine._internal as mgb -import megengine.functional as F -from megengine.core import Graph -from megengine.module import Linear, Module -from megengine.optimizer import SGD -from megengine.test import assertTensorClose - - -def test_compile_multi_times_eager(): - return # XXX: rewrite or remove this test - data = Input("data", shape=(2, 28)) - label = Input("label", shape=(2,), dtype=np.int32) - - mlp = MLP() - opt = SGD(mlp.parameters(requires_grad=True), lr=0.01) - - pred0 = mlp(data) - pred = F.softmax(pred0) - loss = F.square_loss(pred, label.reshape(2, 1)) - opt.zero_grad() - grads = opt.backward(loss) - opt.step() - - f0 = compile(pred, None) - f1 = compile([pred, loss], grads, copy=False) - for _ in range(3): - data = np.random.random((2, 28)).astype(np.float32) - label = np.random.randint(0, 10, (2,)).astype(np.float32) - out0 = f0(data=data) - out1 = f1(data=data, label=label) - assertTensorClose(out0[0], out1[0]) - - -def test_compile_multi_times_static(): - return # XXX: rewrite or remove this test - with Graph() as cg: - cg.set_option("eager_evaluation", False) - data = Input("data", shape=(2, 28)) - label = Input("label", shape=(2,), dtype=np.int32) - - mlp = MLP() - opt = SGD(mlp.parameters(requires_grad=True), lr=0.01) - - pred0 = mlp(data) - pred = F.softmax(pred0) - loss = F.square_loss(pred, label.reshape(2, 1)) - opt.zero_grad() - grads = opt.backward(loss) - opt.step() - - f0 = compile(pred, None) - f1 = compile([pred, loss], grads, copy=True) - - data = np.random.random((2, 28)).astype(np.float32) - label = np.random.randint(0, 10, (2,)).astype(np.float32) - out0 = f0(data=data) - out1 = f1(data=data, label=label) - assertTensorClose(out0[0], out1[0]) - - _ = compile([pred, loss], grads, copy=False) - with pytest.raises(mgb.MegBrainError): - f0(data=data) diff --git a/python_module/test/unit/core/test_index.py b/python_module/test/unit/core/test_index.py deleted file mode 100644 index 4121c08f..00000000 --- a/python_module/test/unit/core/test_index.py +++ /dev/null @@ -1,311 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - -from megengine import Tensor, tensor -from megengine.jit import trace -from megengine.test import assertTensorClose - - -def check_equal(np_tensor, mge_tensor): - assertTensorClose(np_tensor, mge_tensor.numpy()) - - -def test_index(): - a_shape = (10, 10, 10, 10, 10) - a = np.random.random(a_shape).astype(dtype=np.float32) - b = tensor(a) - test_set = {} - test_set["a"] = np.random.random(a_shape).astype(dtype=np.float32) - test_set["b"] = tensor(test_set["a"]) - test_set["c"] = tensor(test_set["a"]) - - def check_id_2(np_idx, mge_idx): - # print('start :', mge_idx) - def get_b(symbolic, *args): - # print('get_b:', args) - def get_func(inp): - for i in mge_idx: - if isinstance(i, (list, Tensor)): - return inp.ai[mge_idx] - return inp[mge_idx] - - func = trace(get_func, symbolic=symbolic) - return func(*args) - - def set_b(symbolic, *args): - # print('set_b:', args) - def set_func(inp, val): - for i in mge_idx: - if isinstance(i, (list, Tensor)): - return inp.set_ai(val)[mge_idx] - return inp.set_subtensor(val)[mge_idx] - - func = trace(set_func, symbolic=symbolic) - return func(*args) - - sub_a = a[np_idx] - for symbolic in [True, False]: - sub_b = get_b(symbolic, b) - check_equal(sub_a, sub_b) - # do not support set - # print(mge_idx) - if not mge_idx: - continue - go_flag = False - for i in mge_idx: - if i is np.newaxis: - go_flag = True - break - if go_flag: - continue - if not symbolic: - test_set["b"] = set_b(symbolic, test_set["b"], sub_b) - check_equal(test_set["a"], test_set["b"]) - else: - test_set["a"][np_idx] = sub_a - test_set["c"] = set_b(symbolic, test_set["c"], sub_b) - check_equal(test_set["a"], test_set["c"]) - - def check_idx(*idx): - check_id_2(idx, idx) - - def tensor_wrap(*idx): - check_idx(*idx) - tensor_idx = [] - numpy_idx = [] - for i in idx: - numpy_idx.append(np.asarray(i).astype(np.int32)) - tensor_idx.append(tensor(numpy_idx[-1])) - a_idx = tuple(numpy_idx) - b_idx = tuple(tensor_idx) - check_id_2(a_idx, b_idx) - - def test_one_dim(): - check_idx(-7) - check_idx(-7, -7) - check_idx(-7, -7, -7) - check_idx(-7, -7, -7, -7) - check_idx(-7, -7, -7, -7, -7) - check_idx(7, 7, 7, 7, 7) - check_idx(7, 7, 7, 7) - check_idx(7, 7, 7) - check_idx(7, 7) - check_idx(7) - check_idx() - - def test_slice(): - - check_idx(slice(1, 7)) - check_idx(slice(1, 7)) - check_idx(slice(7, None)) - check_idx(slice(1, 7, 2)) - check_idx(slice(-7, 5)) - check_idx(slice(7, None, 7)) - check_idx(slice(None, 7, 7)) - - def test_new_axis(): - check_idx(7, np.newaxis, 7) - check_idx(7, np.newaxis, slice(3, 7)) - - def test_ellipsis(): - check_idx(..., 7) - - check_idx(7, ...) 
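# ---- editor's note (illustrative sketch, not part of this diff) ----
# The many check_idx variants in test_index.py all assert one contract:
# indexing a megengine tensor with ints, slices, Ellipsis, newaxis or integer
# arrays matches numpy on the same data. The whole harness reduces to a loop
# like this (legacy API assumed, as in the removed file):
import numpy as np
from megengine import tensor
from megengine.test import assertTensorClose

a = np.random.random((10, 10, 10)).astype(np.float32)
b = tensor(a)
for idx in [(7,), (slice(1, 7, 2),), (..., 7), (7, np.newaxis, 7)]:
    assertTensorClose(a[idx], b[idx].numpy())
# ---- end editor's note ----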
- - check_idx(7, ..., 7, 7) - - check_idx(7, ..., 7, -7) - - check_idx(7, ..., slice(1, 7), -7) - - def test_integer_array(): - - index = [[6, 7, 8], [9, 7, 4], [1, 1, 1], [3, 5, 6], [7, 8, 1]] - - tensor_wrap(index[0]) - tensor_wrap(index[0], index[1]) - tensor_wrap(index[0], index[1], index[2]) - tensor_wrap(index[0], index[1], index[2], index[3]) - tensor_wrap(index[0], index[1], index[2], index[3], index[4]) - - # multi dimension - index = [ - [6, 7, 8, 8, 9, 7], - [9, 7, 4, 1, 8, 2], - [1, 1, 1, 0, 3, 3], - [3, 5, 6, 1, 6, 3], - [7, 8, 1, 1, 8, 2], - ] - - tensor_wrap(index[0]) - tensor_wrap(index[0], index[1]) - tensor_wrap(index[0], index[1], index[2]) - tensor_wrap(index[0], index[1], index[2], index[3]) - tensor_wrap(index[0], index[1], index[2], index[3], index[4]) - - # braodcast - # index = [ - # [6, 7, 8, 8, 9, 7], # 2 * 3 - # [2], # 1 - # [1, 1, 1], # 1 * 3 - # [6, 2], # 2 * 1 - # [7, 8, 1, 1, 8, 2], # 2 * 3 - # ] - - # tensor_wrap(index[0]) - # tensor_wrap(index[0], index[1]) - # tensor_wrap(index[0], index[1], index[2]) - # tensor_wrap(index[0], index[1], index[2], index[3]) - # tensor_wrap(index[0], index[1], index[2], index[3], index[4]) - - def test_multi_dim(): - check_equal(a[7][7, 7, 7], b[7][7, 7, 7]) - check_equal(a[7, 7, 7][7], b[7, 7, 7][7]) - - check_equal(a[7][7][7, 7], b[7][7][7, 7]) - check_equal(a[7][7, 7][7], b[7][7, 7][7]) - check_equal(a[7, 7][7][7], b[7, 7][7][7]) - - check_equal(a[7, 7, 7][7, 7], b[7, 7, 7][7, 7]) - check_equal(a[7, 7][7, 7, 7], b[7, 7][7, 7, 7]) - - check_equal(a[7][1:7:2], b[7][1:7:2]) - - check_equal(a[7][7:], b[7][7:]) - - check_equal(a[7][-7:-1], b[7][-7:-1]) - - check_equal(a[7][-1:-7:-1], b[7][-1:-7:-1]) - - check_equal(a[7:8][:], b[7:8][:]) - check_equal(a[7][:], b[7][:]) - - check_equal(a[7][:][7], b[7][:][7]) - check_equal(a[:][7][7], b[:][7][7]) - - check_equal(a[7][7], b[7][7]) - check_equal(a[7][7][7], b[7][7][7]) - check_equal(a[7][7][7][7], b[7][7][7][7]) - check_equal(a[7][7][7][7][7], b[7][7][7][7][7]) - - def test_hard(): - check_idx(slice(None, None), [6, 7, 8], slice(0, 7), [6, 7, 8]) - check_idx(slice(None, None), slice(0, 7), [6, 7, 8], [6, 7, 8]) - # check_idx(slice(None, None), slice(0, 7), [[6], [7], [8]], [[6, 7, 8]]) - # check_idx(slice(None, None), [[6, 7, 8]], slice(1, 3), [[6], [7], [8]]) - # check_idx(slice(None, None), [[6, 7, 8]], [[6], [7], [8]], slice(1, 3)) - check_idx([6, 7, 8], [6, 7, 8], 7, slice(2, 7)) - # check_idx(Ellipsis, 1, [[[6]]], [[[0]]], slice(1, 4, 2)) - # check_idx(slice(2, 4, 2), 1, Ellipsis, 0, [[[7]], [[3]]]) - # check_idx(slice(7, 10, 2), 3, Ellipsis, [[[5, 0]]], [[[3, 1]]]) - # check_idx(slice(7, 9, 1), [[[4]]], 8, Ellipsis, [[[6]], [[9]]]) - - def test_super_random(): - from random import randint - from random import random as rand - - def true_or_false(ture_prob): - return rand() < ture_prob - - def random_list(limit, size, one_base=False): - if one_base: - return [randint(1, limit) for _ in range(0, size)] - else: - # 0 <= x < limit - return [randint(0, limit - 1) for _ in range(0, size)] - - def generate_random_int_matrix(limit, shape): - if len(shape) == 0: - return [] - if len(shape) == 1: - return random_list(limit, shape[0]) - return [ - generate_random_int_matrix(limit, shape[1:]) for _ in range(0, shape[0]) - ] - - def generate_boardcast_shape(limit_shape): - # new_len = randint(1, len(limit_shape)) - new_len = len(limit_shape) - return [(1 if true_or_false(0.3) else i) for i in limit_shape[:new_len]] - - def g_slice(size): - start = randint(0, size) - if start == size: - 
start = None - end = randint(1 if start is None else start + 1, size + 1) - if end == size + 1: - end = None - return slice(start, end, 1 if true_or_false(0.3) else 2) - - def g_int(size): - return randint(0, size - 1) - - def g_inedx(limit_shape): - new_len = randint(len(limit_shape) // 2, len(limit_shape)) - output = [] - # [5] -> (0 ~ 4) - - cur_dim, cur_new = len(limit_shape), 0 - use_int_array = False - i = 0 - while len(output) < new_len: - flag = rand() - single_idx = None - old_dim, old_new, old_use_int_array = cur_dim, cur_new, use_int_array - if flag < 0.3: - single_idx = g_int(limit_shape[i]) - cur_dim -= 1 - elif flag < 0.5: - single_idx = g_slice(limit_shape[i]) - elif flag < 0.9: - if not use_int_array: - board_cast_dim = random_list(10, 1, one_base=True) - cur_dim += len(board_cast_dim) - use_int_array = True - cur_dim -= 1 - integer_array_shape = generate_boardcast_shape(board_cast_dim) - single_idx = generate_random_int_matrix( - limit_shape[i], integer_array_shape - ) - else: - cur_dim += 1 - cur_new += 1 - single_idx = np.newaxis - # MAX_DIM < 7 - if cur_dim > 7 or cur_new + len(limit_shape) > 7: - cur_dim, cur_new, use_int_array = ( - old_dim, - old_new, - old_use_int_array, - ) - continue - if not single_idx is np.newaxis: - i += 1 - output.append(single_idx) - # print('[cur_dim]: ', cur_dim, output) - - if cur_dim < 7 and rand() < 0.3 and new_len < len(limit_shape): - output.insert(randint(0, len(output)), Ellipsis) - - return tuple(output) - - for i in range(0, 17): - idx = g_inedx(a_shape) - # print('[task {}] {}'.format(i, idx)) - check_idx(*idx) - - test_one_dim() - test_multi_dim() - test_slice() - test_new_axis() - test_ellipsis() - test_integer_array() - test_hard() - test_super_random() diff --git a/python_module/test/unit/core/test_recoverable.py b/python_module/test/unit/core/test_recoverable.py deleted file mode 100644 index 86735d1e..00000000 --- a/python_module/test/unit/core/test_recoverable.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np -import pytest - -import megengine as mge -import megengine._internal as mgb -from megengine.core import tensor -from megengine.test import assertTensorClose - - -def test_recoverable(): - a = tensor() - b = tensor() - a_np = np.random.random((4, 3)).astype("float32") - b_np = np.random.random((3, 7)).astype("float32") - a.set_value(a_np) - b.set_value(b_np) - - # Do some normal computation. - a2 = a * 2 - ab = a @ b - - # Raise a computation error. - with pytest.raises(mgb.MegBrainError): - _ = a * b - - # Variable a2 and ab should be still usable after error happened. - assertTensorClose(a2.numpy(), a_np * 2) - assertTensorClose(ab.numpy(), a_np @ b_np) - - # Should allow computation as well. 
- ab2 = ab ** 2 - assertTensorClose(ab2.numpy(), (a_np @ b_np) ** 2) diff --git a/python_module/test/unit/core/test_release_memory.py b/python_module/test/unit/core/test_release_memory.py deleted file mode 100644 index cf2ef0ac..00000000 --- a/python_module/test/unit/core/test_release_memory.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import time - -import numpy as np -import pytest -from helpers import has_gpu - -import megengine as mge -import megengine.functional as F -from megengine.optimizer import SGD - - -@pytest.mark.skip -@pytest.mark.slow -def test_release_memory(): - mnist_datasets = load_mnist_datasets() - data_train, label_train = mnist_datasets["train"] - - batch_size = 15000 - data_shape = (batch_size, 1, 28, 28) - label_shape = (batch_size,) - - data = nn.Input("data", shape=data_shape, dtype=np.float32) - label = nn.Input( - "label", shape=label_shape, dtype=np.int32, value=np.zeros(label_shape) - ) - - net = MnistNet() - opt = SGD(net.parameters(), lr=0.01) - - pred = F.softmax(net(data)) - loss = F.cross_entropy(pred, label) - - opt.zero_grad() - opt.backward(loss) - add_updates = opt.step() - - mge.graph._default_graph.get_default().clear_device_memory() - - f = mge.graph.compile(loss, add_updates) - - for _ in range(3): - train_loss = 0.0 - for i in range(0, data_train.shape[0], batch_size): - opt.zero_grad() - data = data_train[i : i + batch_size, :, :, :] - label = label_train[i : i + batch_size] - loss = f(data=data, label=label)[0] - train_loss += loss[0] diff --git a/python_module/test/unit/core/test_reshape_broadcast.py b/python_module/test/unit/core/test_reshape_broadcast.py deleted file mode 100644 index 87469906..00000000 --- a/python_module/test/unit/core/test_reshape_broadcast.py +++ /dev/null @@ -1,108 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
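# ---- editor's note (illustrative sketch, not part of this diff) ----
# The reshape/broadcast tests removed below all pin down one contract:
# reshape rearranges the same elements, broadcast repeats a 1-sized axis,
# and both accept a tuple, unpacked ints, or a shape tensor. Distilled
# (legacy API assumed, as in the file below):
import numpy as np
from megengine import tensor
from megengine.test import assertTensorClose

row = tensor(np.arange(4, dtype=np.int32).reshape(1, 4))
assertTensorClose(row.reshape(4, 1).numpy(),
                  np.arange(4, dtype=np.int32).reshape(4, 1))
assertTensorClose(row.broadcast(4, 4).numpy(),
                  np.broadcast_to(np.arange(4, dtype=np.int32).reshape(1, 4), (4, 4)))
# ---- end editor's note ----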
-import numpy as np -import pytest - -from megengine import tensor -from megengine.test import assertTensorClose - - -def test_reshape_tuple(): - inp = tensor(np.arange(1, 17, dtype=np.int32).reshape(4, 4)) - out = tensor(np.arange(100, 116, dtype=np.int32).reshape(1, 16)) - out = out.reshape(inp.shape) - - assertTensorClose(out.numpy(), np.arange(100, 116, dtype=np.int32).reshape(4, 4)) - - -def test_reshape_asterisk(): - inp = tensor(np.arange(1, 17, dtype=np.int32).reshape(4, 4)) - out = tensor(np.arange(100, 116, dtype=np.int32).reshape(1, 16)) - out = out.reshape(*inp.shape) - - assertTensorClose(out.numpy(), np.arange(100, 116, dtype=np.int32).reshape(4, 4)) - - -def test_reshape_shapeof(): - inp = tensor(np.arange(1, 17, dtype=np.int32).reshape(4, 4)) - out = tensor(np.arange(100, 116, dtype=np.int32).reshape(1, 16)) - out = out.reshape(inp.shapeof()) - - assertTensorClose(out.numpy(), np.arange(100, 116, dtype=np.int32).reshape(4, 4)) - - -def test_reshape_tensor(): - out = tensor(np.arange(100, 116, dtype=np.int32).reshape(1, 16)) - out = out.reshape(tensor([4, 4])) - - assertTensorClose(out.numpy(), np.arange(100, 116, dtype=np.int32).reshape(4, 4)) - - -def test_reshape_tensor_fused(): - out = tensor(np.arange(100, 116, dtype=np.int32).reshape(1, 16)) - out = out.reshape(tensor([4, 4]), 1) - - assertTensorClose(out.numpy(), np.arange(100, 116, dtype=np.int32).reshape(4, 4, 1)) - - -def test_reshape_fused(): - out = tensor(np.arange(100, 116, dtype=np.int32).reshape(1, 16)) - out = out.reshape(tensor(2), 2, tensor(4), 1) - - assertTensorClose( - out.numpy(), np.arange(100, 116, dtype=np.int32).reshape(2, 2, 4, 1) - ) - - -def test_reshape_wrong_tuple(): - out = tensor(np.arange(100, 116, dtype=np.int32).reshape(1, 16)) - with pytest.raises(ValueError): - out = out.reshape((2, 2), 4) - - -def test_reshape_wrong_tuple2(): - out = tensor(np.arange(100, 116, dtype=np.int32).reshape(1, 16)) - with pytest.raises(AssertionError): - out = out.reshape(4, (2, 2)) - - -def test_broadcast_tuple(): - inp = tensor(np.arange(1, 17, dtype=np.int32).reshape(4, 4)) - out = tensor(np.arange(100, 104, dtype=np.int32).reshape(1, 4)) - - out = out.broadcast(inp.shape) - - tmp = np.array([[100, 101, 102, 103]], dtype=np.int32) - out2 = np.repeat(tmp, 4, axis=0) - - assertTensorClose(out.numpy(), out2) - - -def test_broadcast_asterisk(): - inp = tensor(np.arange(1, 17, dtype=np.int32).reshape(4, 4)) - out = tensor(np.arange(100, 104, dtype=np.int32).reshape(1, 4)) - - out = out.broadcast(*inp.shape) - - tmp = np.array([[100, 101, 102, 103]], dtype=np.int32) - out2 = np.repeat(tmp, 4, axis=0) - - assertTensorClose(out.numpy(), out2) - - -def test_broadcast_shapeof(): - inp = tensor(np.arange(1, 17, dtype=np.int32).reshape(4, 4)) - out = tensor(np.arange(100, 104, dtype=np.int32).reshape(1, 4)) - - out = out.broadcast(inp.shapeof()) - - tmp = np.array([[100, 101, 102, 103]], dtype=np.int32) - out2 = np.repeat(tmp, 4, axis=0) - - assertTensorClose(out.numpy(), out2) diff --git a/python_module/test/unit/core/test_serialization.py b/python_module/test/unit/core/test_serialization.py deleted file mode 100644 index 85c30eb0..00000000 --- a/python_module/test/unit/core/test_serialization.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import pickle -from tempfile import TemporaryFile - -import numpy as np - -from megengine.core import Buffer, Parameter, tensor -from megengine.test import assertTensorClose - - -def test_tensor_serialization(): - def tensor_eq(a, b): - assert a.dtype == b.dtype - assert a.device == b.device - assert a.requires_grad == b.requires_grad - assertTensorClose(a, b) - - with TemporaryFile() as f: - data = np.random.randint(low=0, high=7, size=[233]) - a = tensor(data, device="xpux", dtype=np.int32) - pickle.dump(a, f) - f.seek(0) - b = pickle.load(f) - tensor_eq(a, b) - - with TemporaryFile() as f: - a = Parameter(np.random.random(size=(233, 2)).astype(np.float32)) - pickle.dump(a, f) - f.seek(0) - b = pickle.load(f) - assert isinstance(b, Parameter) - tensor_eq(a, b) - - with TemporaryFile() as f: - a = Buffer(np.random.random(size=(2, 233)).astype(np.float32)) - pickle.dump(a, f) - f.seek(0) - b = pickle.load(f) - assert isinstance(b, Buffer) - tensor_eq(a, b) diff --git a/python_module/test/unit/core/test_tensor.py b/python_module/test/unit/core/test_tensor.py deleted file mode 100644 index 3ac0a4c8..00000000 --- a/python_module/test/unit/core/test_tensor.py +++ /dev/null @@ -1,91 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
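# ---- editor's note (illustrative sketch, not part of this diff) ----
# test_serialization.py above pins down one property: tensor, Parameter and
# Buffer survive a pickle round trip with dtype, device and class intact.
# The round trip itself is ordinary pickle:
import pickle
import numpy as np
from tempfile import TemporaryFile
from megengine.core import Parameter

with TemporaryFile() as f:
    a = Parameter(np.ones((2, 3), dtype=np.float32))
    pickle.dump(a, f)
    f.seek(0)
    b = pickle.load(f)
assert isinstance(b, Parameter)
assert np.allclose(a.numpy(), b.numpy())
# ---- end editor's note ----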
-import numpy as np -import pytest - -import megengine as mge -import megengine._internal as mgb - - -def test_wrong_dtype(): - with pytest.raises(TypeError): - mge.tensor(np.zeros((5, 5), dtype=np.float64)) - - with pytest.raises(TypeError): - mge.Parameter(np.zeros((5, 5), dtype=np.int64)) - - -def test_tensor_routine(): - mge.tensor(np.zeros((1, 2), dtype=np.int32)) - - mge.tensor([1]) - - mge.tensor(1.5) - - -def test_tensor_set_dtype(): - def check_dtype_value(tensor, dtype_scale, value): - if mgb.dtype.is_quantize(tensor.dtype): - if np.abs(mgb.dtype.get_scale(tensor.dtype) - dtype_scale) > 1e-5: - raise AssertionError( - "compare scale failed expect {} got {}".format( - dtype_scale, mgb.dtype.get_scale(tensor.dtype) - ) - ) - if np.abs(tensor.numpy()[0][0] - value) > 1e-5: - raise AssertionError( - "compare value failed expect {} got {}".format( - tensor.numpy()[0][0], value - ) - ) - - t = mge.Parameter(np.ones((3, 4), dtype="float32")) - t.dtype = mgb.dtype.qint8(0.1) - check_dtype_value(t, 0.1, 10) - - t = mge.Parameter(np.ones((3, 4), dtype=mgb.dtype.qint8(1))) - t.dtype = mgb.dtype.qint8(0.3) - check_dtype_value(t, 0.3, 3) - - t = mge.Buffer(np.ones((3, 4), dtype="float32")) - t.dtype = mgb.dtype.qint8(0.1) - check_dtype_value(t, 0.1, 10) - - t = mge.Buffer(np.ones((3, 4), dtype=mgb.dtype.qint8(1))) - t.dtype = mgb.dtype.qint8(0.3) - check_dtype_value(t, 0.3, 3) - - t = mge.Buffer(np.ones((3, 4), dtype="float32")) - s = t + 1 - s.dtype = mgb.dtype.qint8(0.2) - check_dtype_value(s, 0.2, 10) - - t.dtype = mgb.dtype.qint8(0.3) - s = t + 1 - s.dtype = mgb.dtype.qint8(0.1) - check_dtype_value(s, 0.1, 18) - s.dtype = "float32" - check_dtype_value(s, 0, 1.8) - - -def test_tensor_name(): - p = mge.Parameter(np.ones((3, 4), dtype="float32")) - assert "shared" in p.name - with pytest.raises(ValueError): - p.name = "Parameter0" - - b = mge.Buffer(np.ones((3, 4), dtype="float32")) - assert "shared" in b.name - with pytest.raises(ValueError): - b.name = "Buffer0" - - s = b + 1 - assert "ADD" in s.name - s.name = "WeightAdd1" - assert s.name == "WeightAdd1" diff --git a/python_module/test/unit/core/test_zeros_ones.py b/python_module/test/unit/core/test_zeros_ones.py deleted file mode 100644 index 4d14653b..00000000 --- a/python_module/test/unit/core/test_zeros_ones.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
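# ---- editor's note (illustrative sketch, not part of this diff) ----
# test_tensor_set_dtype above relies on quantized dtype reinterpretation:
# assigning mgb.dtype.qint8(scale) to .dtype reads the stored values back in
# quantized steps, so a float 1.0 under scale 0.1 becomes integer step 10.
# Distilled (legacy mgb dtype API, as in the removed file):
import numpy as np
import megengine as mge
import megengine._internal as mgb

t = mge.Parameter(np.ones((3, 4), dtype="float32"))
t.dtype = mgb.dtype.qint8(0.1)
assert abs(mgb.dtype.get_scale(t.dtype) - 0.1) < 1e-5
assert abs(t.numpy()[0][0] - 10) < 1e-5    # 1.0 / 0.1 == 10 quantized steps
# ---- end editor's note ----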
-import numpy as np -import pytest - -import megengine as mge -from megengine.test import assertTensorClose - - -def test_zeros(): - assertTensorClose( - mge.zeros((2, 2), dtype=np.int32).numpy(), np.zeros((2, 2), dtype=np.int32) - ) - - assertTensorClose( - mge.zeros(mge.tensor([2, 2], dtype=np.int32), dtype=np.int32).numpy(), - np.zeros((2, 2), dtype=np.int32), - ) - - -def test_ones(): - assertTensorClose( - mge.ones((2, 2), dtype=np.int32).numpy(), np.ones((2, 2), dtype=np.int32) - ) - - assertTensorClose( - mge.ones(mge.tensor([2, 2], dtype=np.int32), dtype=np.int32).numpy(), - np.ones((2, 2), dtype=np.int32), - ) diff --git a/python_module/test/unit/data/__init__.py b/python_module/test/unit/data/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python_module/test/unit/data/test_dataloader.py b/python_module/test/unit/data/test_dataloader.py deleted file mode 100644 index 6bb0f3e3..00000000 --- a/python_module/test/unit/data/test_dataloader.py +++ /dev/null @@ -1,183 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import os -import time - -import numpy as np -import pytest - -from megengine.data.collator import Collator -from megengine.data.dataloader import DataLoader -from megengine.data.dataset import ArrayDataset -from megengine.data.sampler import RandomSampler, SequentialSampler -from megengine.data.transform import PseudoTransform, Transform - - -def init_dataset(): - sample_num = 100 - rand_data = np.random.randint(0, 255, size=(sample_num, 1, 32, 32), dtype=np.uint8) - label = np.random.randint(0, 10, size=(sample_num,), dtype=int) - dataset = ArrayDataset(rand_data, label) - return dataset - - -def test_dataloader_init(): - dataset = init_dataset() - with pytest.raises(ValueError): - dataloader = DataLoader(dataset, num_workers=2, divide=True) - with pytest.raises(ValueError): - dataloader = DataLoader(dataset, num_workers=-1) - with pytest.raises(ValueError): - dataloader = DataLoader(dataset, timeout=-1) - with pytest.raises(ValueError): - dataloader = DataLoader(dataset, num_workers=0, divide=True) - - dataloader = DataLoader(dataset) - assert isinstance(dataloader.sampler, SequentialSampler) - assert isinstance(dataloader.transform, PseudoTransform) - assert isinstance(dataloader.collator, Collator) - - dataloader = DataLoader( - dataset, sampler=RandomSampler(dataset, batch_size=6, drop_last=False) - ) - assert len(dataloader) == 17 - dataloader = DataLoader( - dataset, sampler=RandomSampler(dataset, batch_size=6, drop_last=True) - ) - assert len(dataloader) == 16 - - -def test_dataloader_serial(): - dataset = init_dataset() - dataloader = DataLoader( - dataset, sampler=RandomSampler(dataset, batch_size=4, drop_last=False) - ) - for (data, label) in dataloader: - assert data.shape == (4, 1, 32, 32) - assert label.shape == (4,) - - -def test_dataloader_parallel(): - # set max shared memory to 100M - os.environ["MGE_PLASMA_MEMORY"] = "100000000" - - dataset = init_dataset() - dataloader = DataLoader( - dataset, - sampler=RandomSampler(dataset, batch_size=4, drop_last=False), - num_workers=2, - divide=False, - ) - for (data, label) in dataloader: - assert data.shape == (4, 1, 32, 32) - assert label.shape == 
(4,) - - dataloader = DataLoader( - dataset, - sampler=RandomSampler(dataset, batch_size=4, drop_last=False), - num_workers=2, - divide=True, - ) - for (data, label) in dataloader: - assert data.shape == (4, 1, 32, 32) - assert label.shape == (4,) - - -def test_dataloader_parallel_timeout(): - dataset = init_dataset() - - class TimeoutTransform(Transform): - def __init__(self): - pass - - def apply(self, input): - time.sleep(10) - return input - - dataloader = DataLoader( - dataset, - sampler=RandomSampler(dataset, batch_size=4, drop_last=False), - transform=TimeoutTransform(), - num_workers=2, - timeout=2, - ) - with pytest.raises(RuntimeError, match=r".*timeout.*"): - data_iter = iter(dataloader) - batch_data = next(data_iter) - - -def test_dataloader_parallel_worker_exception(): - dataset = init_dataset() - - class FakeErrorTransform(Transform): - def __init__(self): - pass - - def apply(self, input): - y = x + 1 - return input - - dataloader = DataLoader( - dataset, - sampler=RandomSampler(dataset, batch_size=4, drop_last=False), - transform=FakeErrorTransform(), - num_workers=2, - ) - with pytest.raises(RuntimeError, match=r"worker.*died"): - data_iter = iter(dataloader) - batch_data = next(data_iter) - - -def _multi_instances_parallel_dataloader_worker(): - dataset = init_dataset() - - for divide_flag in [True, False]: - train_dataloader = DataLoader( - dataset, - sampler=RandomSampler(dataset, batch_size=4, drop_last=False), - num_workers=2, - divide=divide_flag, - ) - val_dataloader = DataLoader( - dataset, - sampler=RandomSampler(dataset, batch_size=10, drop_last=False), - num_workers=2, - divide=divide_flag, - ) - for idx, (data, label) in enumerate(train_dataloader): - assert data.shape == (4, 1, 32, 32) - assert label.shape == (4,) - if idx % 5 == 0: - for val_data, val_label in val_dataloader: - assert val_data.shape == (10, 1, 32, 32) - assert val_label.shape == (10,) - - -def test_dataloader_parallel_multi_instances(): - # set max shared memory to 100M - os.environ["MGE_PLASMA_MEMORY"] = "100000000" - - _multi_instances_parallel_dataloader_worker() - - -def test_dataloader_parallel_multi_instances_multiprocessing(): - # set max shared memory to 100M - os.environ["MGE_PLASMA_MEMORY"] = "100000000" - - import multiprocessing as mp - - # mp.set_start_method("spawn") - processes = [] - for i in range(4): - p = mp.Process(target=_multi_instances_parallel_dataloader_worker) - p.start() - processes.append(p) - - for p in processes: - p.join() diff --git a/python_module/test/unit/data/test_dataset.py b/python_module/test/unit/data/test_dataset.py deleted file mode 100644 index d68d3784..00000000 --- a/python_module/test/unit/data/test_dataset.py +++ /dev/null @@ -1,43 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
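# ---- editor's note (illustrative sketch, not part of this diff) ----
# The dataloader tests above boil down to: a DataLoader is sized by its
# sampler, and drop_last decides whether a ragged final batch survives.
# Using only the pieces that appear in the removed file:
import numpy as np
from megengine.data.dataloader import DataLoader
from megengine.data.dataset import ArrayDataset
from megengine.data.sampler import RandomSampler

data = np.random.randint(0, 255, size=(100, 1, 32, 32), dtype=np.uint8)
label = np.random.randint(0, 10, size=(100,), dtype=int)
dataset = ArrayDataset(data, label)

loader = DataLoader(dataset, sampler=RandomSampler(dataset, batch_size=6, drop_last=False))
assert len(loader) == 17    # ceil(100 / 6): the short final batch is kept
loader = DataLoader(dataset, sampler=RandomSampler(dataset, batch_size=6, drop_last=True))
assert len(loader) == 16    # floor(100 / 6): the short final batch is dropped
# ---- end editor's note ----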
-import os -import sys - -import numpy as np -import pytest - -from megengine.data.dataset import ArrayDataset, Dataset, MapDataset, StreamDataset - - -def test_abstract_cls(): - with pytest.raises(TypeError): - Dataset() - with pytest.raises(TypeError): - MapDataset() - with pytest.raises(TypeError): - StreamDataset() - - -def test_array_dataset(): - size = (10,) - data_shape = (3, 256, 256) - label_shape = (1,) - data = np.random.randint(0, 255, size + data_shape) - label = np.random.randint(0, 9, size + label_shape) - dataset = ArrayDataset(data, label) - assert dataset[0][0].shape == data_shape - assert dataset[0][1].shape == label_shape - assert len(dataset) == size[0] - - -def test_array_dataset_dim_error(): - data = np.random.randint(0, 255, (10, 3, 256, 256)) - label = np.random.randint(0, 9, (1,)) - with pytest.raises(ValueError): - ArrayDataset(data, label) diff --git a/python_module/test/unit/data/test_sampler.py b/python_module/test/unit/data/test_sampler.py deleted file mode 100644 index bc399e86..00000000 --- a/python_module/test/unit/data/test_sampler.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import copy -import os -import sys - -import numpy as np -import pytest - -from megengine.data.dataset import ArrayDataset -from megengine.data.sampler import RandomSampler, ReplacementSampler, SequentialSampler - - -def test_sequential_sampler(): - indices = list(range(100)) - sampler = SequentialSampler(ArrayDataset(indices)) - assert indices == list(each[0] for each in sampler) - - -def test_RandomSampler(): - indices = list(range(20)) - indices_copy = copy.deepcopy(indices) - sampler = RandomSampler(ArrayDataset(indices_copy)) - sample_indices = sampler - assert indices != list(each[0] for each in sample_indices) - assert indices == sorted(list(each[0] for each in sample_indices)) - - -def test_random_sampler_seed(): - seed = [0, 1] - indices = list(range(20)) - indices_copy1 = copy.deepcopy(indices) - indices_copy2 = copy.deepcopy(indices) - indices_copy3 = copy.deepcopy(indices) - sampler1 = RandomSampler(ArrayDataset(indices_copy1), seed=seed[0]) - sampler2 = RandomSampler(ArrayDataset(indices_copy2), seed=seed[0]) - sampler3 = RandomSampler(ArrayDataset(indices_copy3), seed=seed[1]) - assert indices != list(each[0] for each in sampler1) - assert indices != list(each[0] for each in sampler2) - assert indices != list(each[0] for each in sampler3) - assert indices == sorted(list(each[0] for each in sampler1)) - assert indices == sorted(list(each[0] for each in sampler2)) - assert indices == sorted(list(each[0] for each in sampler3)) - assert list(each[0] for each in sampler1) == list(each[0] for each in sampler2) - assert list(each[0] for each in sampler1) != list(each[0] for each in sampler3) - - -def test_ReplacementSampler(): - num_samples = 30 - indices = list(range(20)) - weights = list(range(20)) - sampler = ReplacementSampler( - ArrayDataset(indices), num_samples=num_samples, weights=weights - ) - assert len(list(each[0] for each in sampler)) == num_samples - - -def test_sampler_drop_last_false(): - batch_size = 5 - drop_last = False - indices = list(range(24)) - sampler = SequentialSampler( - 
ArrayDataset(indices), batch_size=batch_size, drop_last=drop_last - ) - assert len([each for each in sampler]) == len(sampler) - - -def test_sampler_drop_last_true(): - batch_size = 5 - drop_last = True - indices = list(range(24)) - sampler = SequentialSampler( - ArrayDataset(indices), batch_size=batch_size, drop_last=drop_last - ) - assert len([each for each in sampler]) == len(sampler) diff --git a/python_module/test/unit/data/test_transform.py b/python_module/test/unit/data/test_transform.py deleted file mode 100644 index d438d8b5..00000000 --- a/python_module/test/unit/data/test_transform.py +++ /dev/null @@ -1,108 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - -from megengine.data.transform import * - -data_shape = (100, 100, 3) -label_shape = (4,) -ToMode_target_shape = (3, 100, 100) -CenterCrop_size = (90, 70) -CenterCrop_target_shape = CenterCrop_size + (3,) -RandomResizedCrop_size = (50, 50) -RandomResizedCrop_target_shape = RandomResizedCrop_size + (3,) - - -def generate_data(): - return [ - ( - (np.random.rand(*data_shape) * 255).astype(np.uint8), - np.random.randint(10, size=label_shape), - ) - for _ in range(*label_shape) - ] - - -def test_ToMode(): - t = ToMode(mode="CHW") - aug_data = t.apply_batch(generate_data()) - aug_data_shape = [(a.shape, b.shape) for a, b in aug_data] - target_shape = [(ToMode_target_shape, label_shape)] * 4 - assert aug_data_shape == target_shape - - -def test_CenterCrop(): - t = CenterCrop(output_size=CenterCrop_size) - aug_data = t.apply_batch(generate_data()) - aug_data_shape = [(a.shape, b.shape) for a, b in aug_data] - target_shape = [(CenterCrop_target_shape, label_shape)] * 4 - assert aug_data_shape == target_shape - - -def test_ColorJitter(): - t = ColorJitter() - aug_data = t.apply_batch(generate_data()) - aug_data_shape = [(a.shape, b.shape) for a, b in aug_data] - target_shape = [(data_shape, label_shape)] * 4 - assert aug_data_shape == target_shape - - -def test_RandomHorizontalFlip(): - t = RandomHorizontalFlip(prob=1) - aug_data = t.apply_batch(generate_data()) - aug_data_shape = [(a.shape, b.shape) for a, b in aug_data] - target_shape = [(data_shape, label_shape)] * 4 - assert aug_data_shape == target_shape - - -def test_RandomVerticalFlip(): - t = RandomVerticalFlip(prob=1) - aug_data = t.apply_batch(generate_data()) - aug_data_shape = [(a.shape, b.shape) for a, b in aug_data] - target_shape = [(data_shape, label_shape)] * 4 - assert aug_data_shape == target_shape - - -def test_RandomResizedCrop(): - t = RandomResizedCrop(output_size=RandomResizedCrop_size) - aug_data = t.apply_batch(generate_data()) - aug_data_shape = [(a.shape, b.shape) for a, b in aug_data] - target_shape = [(RandomResizedCrop_target_shape, label_shape)] * 4 - assert aug_data_shape == target_shape - - -def test_Normalize(): - t = Normalize() - aug_data = t.apply_batch(generate_data()) - aug_data_shape = [(a.shape, b.shape) for a, b in aug_data] - target_shape = [(data_shape, label_shape)] * 4 - assert aug_data_shape == target_shape - - -def test_RandomCrop(): - t = RandomCrop((150, 120), padding_size=10, padding_value=[1, 2, 3]) - aug_data = t.apply_batch(generate_data()) - aug_data_shape = [(a.shape, 
b.shape) for a, b in aug_data] - target_shape = [((150, 120, 3), label_shape)] * 4 - assert aug_data_shape == target_shape - - -def test_Compose(): - t = Compose( - [ - CenterCrop(output_size=CenterCrop_size), - RandomHorizontalFlip(prob=1), - ToMode(mode="CHW"), - ] - ) - aug_data = t.apply_batch(generate_data()) - aug_data_shape = [(a.shape, b.shape) for a, b in aug_data] - print(aug_data_shape) - target_shape = [((3, 90, 70), label_shape)] * 4 - assert aug_data_shape == target_shape diff --git a/python_module/test/unit/distributed/test_functional.py b/python_module/test/unit/distributed/test_functional.py deleted file mode 100644 index 8bbaf7e0..00000000 --- a/python_module/test/unit/distributed/test_functional.py +++ /dev/null @@ -1,467 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -import multiprocessing as mp -import platform - -import numpy as np -import pytest - -import megengine as mge -import megengine.distributed as dist -from megengine.core import Parameter, tensor - - -def _init_process_group_wrapper(world_size, rank, dev, backend, q): - if rank == 0: - dist.init_process_group("localhost", 0, world_size, rank, dev, backend) - q.put(dist.get_master_port()) - else: - port = q.get() - dist.init_process_group("localhost", port, world_size, rank, dev, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_reduce_sum(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.reduce_sum(inp) - if rank == 0: - assert np.allclose(output.numpy(), expect) - else: - assert np.allclose(output.numpy(), 0) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = np.random.rand(*shape).astype("float32") - z = x + y - p0 = mp.Process(target=worker, args=(0, x, backend, z, port_queue)) - p1 = mp.Process(target=worker, args=(1, y, backend, None, port_queue)) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (99, 77)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_gather(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.gather(inp) - if rank == 0: - assert np.allclose(output.numpy(), expect) - else: - assert np.allclose(output.numpy(), 0) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = np.random.rand(*shape).astype("float32") 
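# ---- editor's note (illustrative sketch, not part of this diff) ----
# Every distributed test in this file reuses one scaffold: rank 0 creates the
# process group and publishes its master port through a Queue, the peer reads
# it before joining, and the parent asserts both workers exited cleanly. The
# handshake alone, with the collective left out (pure multiprocessing; fork
# start method assumed; the port value is a hypothetical stand-in):
import multiprocessing as mp

def _worker(rank, q):
    if rank == 0:
        q.put(23333)                 # stand-in for dist.get_master_port()
    else:
        assert q.get() == 23333      # peer blocks until the port is published

q = mp.Queue()
procs = [mp.Process(target=_worker, args=(r, q)) for r in range(2)]
for p in procs:
    p.start()
for p in procs:
    p.join(10)
assert all(p.exitcode == 0 for p in procs)
# ---- end editor's note ----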
- z = np.concatenate((x, y)) - p0 = mp.Process(target=worker, args=(0, x, backend, z, port_queue)) - p1 = mp.Process(target=worker, args=(1, y, backend, None, port_queue)) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (99, 77)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_broadcast(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.broadcast(inp) - assert np.allclose(output.numpy(), expect) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = x + 1 - p0 = mp.Process(target=worker, args=(0, x, backend, x, port_queue)) - p1 = mp.Process(target=worker, args=(1, y, backend, x, port_queue)) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (99, 77)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_scatter(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.scatter(inp) - assert np.allclose(output.numpy(), expect) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = x + 1 - p0 = mp.Process( - target=worker, args=(0, x, backend, x[: shape[0] // 2], port_queue) - ) - p1 = mp.Process( - target=worker, args=(1, y, backend, x[shape[0] // 2 :], port_queue) - ) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (100, 77)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_all_to_all(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.all_to_all(inp) - assert np.allclose(output.numpy(), expect) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = np.random.rand(*shape).astype("float32") - a = np.concatenate((x[: shape[0] // 2], y[: shape[0] // 2])) - b = np.concatenate((x[shape[0] // 2 :], y[shape[0] // 2 :])) - p0 = mp.Process(target=worker, args=(0, x, backend, a, port_queue)) - p1 = mp.Process(target=worker, args=(1, y, backend, b, port_queue)) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - 
assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (100, 77)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_all_gather(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.all_gather(inp) - assert np.allclose(output.numpy(), expect) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = np.random.rand(*shape).astype("float32") - z = np.concatenate((x, y)) - p0 = mp.Process(target=worker, args=(0, x, backend, z, port_queue)) - p1 = mp.Process(target=worker, args=(1, y, backend, z, port_queue)) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (99, 77)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_reduce_scatter_sum(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.reduce_scatter_sum(inp) - assert np.allclose(output.numpy(), expect) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = np.random.rand(*shape).astype("float32") - z = x + y - p0 = mp.Process( - target=worker, args=(0, x, backend, z[: shape[0] // 2], port_queue) - ) - p1 = mp.Process( - target=worker, args=(1, y, backend, z[shape[0] // 2 :], port_queue) - ) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 4), (8, 10), (88, 44)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_all_reduce_sum(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.all_reduce_sum(inp) - assert np.allclose(output.numpy(), expect) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = np.random.rand(*shape).astype("float32") - z = x + y - p0 = mp.Process(target=worker, args=(0, x, backend, z, port_queue)) - p1 = mp.Process(target=worker, args=(1, y, backend, z, port_queue)) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (99, 77)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", 
reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_all_reduce_max(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.all_reduce_max(inp) - assert np.allclose(output.numpy(), expect) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = np.random.rand(*shape).astype("float32") - z = np.maximum(x, y) - p0 = mp.Process(target=worker, args=(0, x, backend, z, port_queue)) - p1 = mp.Process(target=worker, args=(1, y, backend, z, port_queue)) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (99, 77)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_all_reduce_min(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = tensor(data) - output = dist.functional.all_reduce_min(inp) - assert np.allclose(output.numpy(), expect) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = np.random.rand(*shape).astype("float32") - z = np.minimum(x, y) - p0 = mp.Process(target=worker, args=(0, x, backend, z, port_queue)) - p1 = mp.Process(target=worker, args=(1, y, backend, z, port_queue)) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (99, 77)]: - for backend in ["nccl"]: - check(shape, backend) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -@pytest.mark.isolated_distributed -def test_bcast_param(): - world_size = 2 - - def worker(rank, data, backend, expect, port_queue): - if mge.get_device_count("gpu") < world_size: - return - _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) - inp = Parameter(data) - dist.functional.bcast_param(inp) - assert np.allclose(inp.numpy(), expect) - - def check(shape, backend): - port_queue = mp.Queue() - x = np.random.rand(*shape).astype("float32") - y = x + 1 - p0 = mp.Process(target=worker, args=(0, x, backend, x, port_queue)) - p1 = mp.Process(target=worker, args=(1, y, backend, x, port_queue)) - - p0.start() - p1.start() - - p0.join(10) - p1.join(10) - - assert p0.exitcode == 0 and p1.exitcode == 0 - - for shape in [(2, 3), (8, 10), (99, 77)]: - for backend in ["nccl"]: - check(shape, backend) diff --git a/python_module/test/unit/distributed/test_util.py b/python_module/test/unit/distributed/test_util.py deleted file mode 100644 index 04b34244..00000000 --- a/python_module/test/unit/distributed/test_util.py +++ /dev/null @@ -1,187 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii 
diff --git a/python_module/test/unit/distributed/test_util.py b/python_module/test/unit/distributed/test_util.py
deleted file mode 100644
index 04b34244..00000000
--- a/python_module/test/unit/distributed/test_util.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import multiprocessing as mp
-import platform
-import queue
-from time import sleep
-
-import pytest
-
-import megengine as mge
-import megengine._internal as mgb
-import megengine.distributed as dist
-
-_LOCALHOST = "127.0.0.1"
-
-
-def _assert_q_empty(q):
-    try:
-        res = q.get(timeout=1)
-    except Exception as e:
-        assert isinstance(e, queue.Empty)
-    else:
-        assert False, "queue is not empty"
-
-
-def _assert_q_val(q, val):
-    ret = q.get()
-    assert ret == val
-
-
-def _init_process_group_wrapper(world_size, rank, dev, backend, q):
-    if rank == 0:
-        dist.init_process_group(_LOCALHOST, 0, world_size, rank, dev, backend)
-        q.put(dist.get_master_port())
-    else:
-        port = q.get()
-        dist.init_process_group(_LOCALHOST, port, world_size, rank, dev, backend)
-
-
-@pytest.mark.skipif(
-    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
-)
-@pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
-)
-@pytest.mark.isolated_distributed
-def test_create_mm_server():
-    def worker():
-        if not mge.is_cuda_available():
-            return
-        port = mgb.config.create_mm_server("0.0.0.0", 0)
-        assert port > 0
-        res = mgb.config.create_mm_server("0.0.0.0", port)
-        assert res == -1
-
-    p = mp.Process(target=worker)
-
-    p.start()
-
-    p.join(10)
-
-    assert p.exitcode == 0
-
-
-@pytest.mark.skipif(
-    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
-)
-@pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
-)
-@pytest.mark.isolated_distributed
-def test_init_process_group():
-    world_size = 2
-
-    def worker(rank, backend, q):
-        if not mge.is_cuda_available():
-            return
-        _init_process_group_wrapper(world_size, rank, rank, backend, q)
-        assert dist.is_distributed() == True
-        assert dist.get_master_ip() == _LOCALHOST
-        assert dist.get_master_port() > 0
-        assert dist.get_world_size() == world_size
-        assert dist.get_rank() == rank
-        assert dist.get_backend() == backend
-
-    def check(backend):
-        Q = mp.Queue()
-        p0 = mp.Process(target=worker, args=(0, backend, Q))
-        p1 = mp.Process(target=worker, args=(1, backend, Q))
-
-        p0.start()
-        p1.start()
-
-        p0.join(10)
-        p1.join(10)
-
-        assert p0.exitcode == 0 and p1.exitcode == 0
-
-    check("nccl")
-
-
-@pytest.mark.skipif(
-    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
-)
-@pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
-)
-@pytest.mark.isolated_distributed
-def test_group_barrier():
-    world_size = 2
-    ip = "127.0.0.1"
-    backend = "nccl"
-
-    def worker(rank, q):
-        if not mge.is_cuda_available():
-            return
-        _init_process_group_wrapper(world_size, rank, rank, backend, q)
-        dist.group_barrier()
-        if rank == 0:
-            dist.group_barrier()
-            q.put(0)  # to be observed in rank 1
-        else:
-            _assert_q_empty(q)  # q.put(0) is not executed in rank 0
-            dist.group_barrier()
-            _assert_q_val(q, 0)  # q.put(0) executed in rank 0
-
-    Q = mp.Queue()
-    p0 = mp.Process(target=worker, args=(0, Q))
-    p1 = mp.Process(target=worker, args=(1, Q))
-
-    p0.start()
-    p1.start()
-
-    p0.join(10)
-    p1.join(10)
-
-    assert p0.exitcode == 0 and p1.exitcode == 0
-
-
-@pytest.mark.skipif(
-    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
-)
-@pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
-)
-@pytest.mark.isolated_distributed
-def test_synchronized():
-    world_size = 2
-    backend = "nccl"
-
-    @dist.synchronized
-    def func(rank, q):
-        q.put(rank)
-
-    def worker(rank, q):
-        if not mge.is_cuda_available():
-            return
-        _init_process_group_wrapper(world_size, rank, rank, backend, q)
-        dist.group_barrier()
-        if rank == 0:
-            func(0, q)  # q.put(0)
-            q.put(2)
-        else:
-            _assert_q_val(q, 0)  # func executed in rank 0
-            _assert_q_empty(q)  # q.put(2) is not executed
-            func(1, q)
-            _assert_q_val(
-                q, 1
-            )  # func in rank 1 executed earlier than q.put(2) in rank 0
-            _assert_q_val(q, 2)  # q.put(2) executed in rank 0
-
-    Q = mp.Queue()
-    p0 = mp.Process(target=worker, args=(0, Q))
-    p1 = mp.Process(target=worker, args=(1, Q))
-
-    p0.start()
-    p1.start()
-
-    p0.join(10)
-    p1.join(10)
-
-    assert p0.exitcode == 0 and p1.exitcode == 0
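The queue choreography in test_synchronized above pins down what @dist.synchronized guarantees: rank 0's q.put(2) is only observed after rank 1 has run func as well, which is consistent with a wrapper that places a barrier after the wrapped call. A sketch of that reading (the barrier placement is inferred from the test's ordering assertions, not quoted from MegEngine's implementation):

    import functools

    def group_barrier():
        pass  # stand-in for megengine.distributed.group_barrier

    def synchronized(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            ret = func(*args, **kwargs)
            group_barrier()  # no rank returns until every rank has run func
            return ret
        return wrapper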
- """ - x = np.linspace(-6, 6, dtype="float32") - assertTensorClose(F.clamp(tensor(x) + 3, 0, 6).numpy(), np.clip(x + 3, 0, 6)) - assertTensorClose(F.clamp(tensor(x) - 3, -6, 0).numpy(), np.clip(x - 3, -6, 0)) - - -def test_isnan(): - for case in [[1, float("nan"), 0]]: - assertTensorClose(F.isnan(tensor(case)), np.isnan(case).astype("uint8")) - - -def test_isinf(): - for case in [[1, float("inf"), 0]]: - assertTensorClose(F.isinf(tensor(case)), np.isinf(case).astype("uint8")) diff --git a/python_module/test/unit/functional/test_functional.py b/python_module/test/unit/functional/test_functional.py deleted file mode 100644 index a3739b3d..00000000 --- a/python_module/test/unit/functional/test_functional.py +++ /dev/null @@ -1,496 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np -import pytest -from helpers import opr_test - -import megengine._internal as mgb -import megengine.functional as F -from megengine import Buffer, Parameter, is_cuda_available, jit, tensor -from megengine.test import assertTensorClose - - -def test_flatten(): - data0_shape = (2, 3, 4, 5) - data1_shape = (4, 5, 6, 7) - data0 = np.random.random(data0_shape).astype(np.float32) - data1 = np.random.random(data1_shape).astype(np.float32) - - def compare_fn(x, y): - assert x.numpy().shape == y - - output0 = (2 * 3 * 4 * 5,) - output1 = (4 * 5 * 6 * 7,) - cases = [{"input": data0, "output": output0}, {"input": data1, "output": output1}] - opr_test(cases, F.flatten, compare_fn=compare_fn) - - output0 = (2, 3 * 4 * 5) - output1 = (4, 5 * 6 * 7) - cases = [{"input": data0, "output": output0}, {"input": data1, "output": output1}] - opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1) - - output0 = (2, 3, 4 * 5) - output1 = (4, 5, 6 * 7) - cases = [{"input": data0, "output": output0}, {"input": data1, "output": output1}] - opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=2) - - output0 = (2, 3 * 4, 5) - output1 = (4, 5 * 6, 7) - cases = [{"input": data0, "output": output0}, {"input": data1, "output": output1}] - opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1, end_axis=2) - - -def test_where(): - maskv0 = np.array([[1, 0], [0, 1]], dtype=np.int32) - xv0 = np.array([[1, np.inf], [np.nan, 4]], dtype=np.float32) - yv0 = np.array([[5, 6], [7, 8]], dtype=np.float32) - - maskv1 = np.array([[1, 0, 1], [1, 0, 0], [1, 1, 0]], dtype=np.int32) - xv1 = np.array([[1, np.inf, 2], [0, np.nan, 4], [1, 5, 7]], dtype=np.float32) - yv1 = np.array([[5, 6, 9], [2, 7, 8], [2, 1, 9]], dtype=np.float32) - - cases = [ - {"input": [maskv0, xv0, yv0]}, - {"input": [maskv1, xv1, yv1]}, - ] - opr_test(cases, F.where, ref_fn=np.where) - - maskv2 = np.array([1, 1, 1], dtype=np.int32) - xv2 = np.array([1, 3, 2], dtype=np.float32) - yv2 = np.array([5, 6, 9], dtype=np.float32) - - maskv3 = np.array([0, 0, 0], dtype=np.int32) - xv3 = np.array([1, 3, 2], dtype=np.float32) - yv3 = np.array([5, 6, 9], dtype=np.float32) - - cases = [ - {"input": [maskv2, xv2, yv2]}, - {"input": [maskv3, xv3, yv3]}, - ] - opr_test(cases, F.where, ref_fn=np.where) - - -def test_eye(): - dtype = np.float32 - cases = [{"input": [10, 20]}, {"input": [20, 30]}] - opr_test(cases, F.eye, 
diff --git a/python_module/test/unit/functional/test_functional.py b/python_module/test/unit/functional/test_functional.py
deleted file mode 100644
index a3739b3d..00000000
--- a/python_module/test/unit/functional/test_functional.py
+++ /dev/null
@@ -1,496 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import numpy as np
-import pytest
-from helpers import opr_test
-
-import megengine._internal as mgb
-import megengine.functional as F
-from megengine import Buffer, Parameter, is_cuda_available, jit, tensor
-from megengine.test import assertTensorClose
-
-
-def test_flatten():
-    data0_shape = (2, 3, 4, 5)
-    data1_shape = (4, 5, 6, 7)
-    data0 = np.random.random(data0_shape).astype(np.float32)
-    data1 = np.random.random(data1_shape).astype(np.float32)
-
-    def compare_fn(x, y):
-        assert x.numpy().shape == y
-
-    output0 = (2 * 3 * 4 * 5,)
-    output1 = (4 * 5 * 6 * 7,)
-    cases = [{"input": data0, "output": output0}, {"input": data1, "output": output1}]
-    opr_test(cases, F.flatten, compare_fn=compare_fn)
-
-    output0 = (2, 3 * 4 * 5)
-    output1 = (4, 5 * 6 * 7)
-    cases = [{"input": data0, "output": output0}, {"input": data1, "output": output1}]
-    opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1)
-
-    output0 = (2, 3, 4 * 5)
-    output1 = (4, 5, 6 * 7)
-    cases = [{"input": data0, "output": output0}, {"input": data1, "output": output1}]
-    opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=2)
-
-    output0 = (2, 3 * 4, 5)
-    output1 = (4, 5 * 6, 7)
-    cases = [{"input": data0, "output": output0}, {"input": data1, "output": output1}]
-    opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1, end_axis=2)
-
-
-def test_where():
-    maskv0 = np.array([[1, 0], [0, 1]], dtype=np.int32)
-    xv0 = np.array([[1, np.inf], [np.nan, 4]], dtype=np.float32)
-    yv0 = np.array([[5, 6], [7, 8]], dtype=np.float32)
-
-    maskv1 = np.array([[1, 0, 1], [1, 0, 0], [1, 1, 0]], dtype=np.int32)
-    xv1 = np.array([[1, np.inf, 2], [0, np.nan, 4], [1, 5, 7]], dtype=np.float32)
-    yv1 = np.array([[5, 6, 9], [2, 7, 8], [2, 1, 9]], dtype=np.float32)
-
-    cases = [
-        {"input": [maskv0, xv0, yv0]},
-        {"input": [maskv1, xv1, yv1]},
-    ]
-    opr_test(cases, F.where, ref_fn=np.where)
-
-    maskv2 = np.array([1, 1, 1], dtype=np.int32)
-    xv2 = np.array([1, 3, 2], dtype=np.float32)
-    yv2 = np.array([5, 6, 9], dtype=np.float32)
-
-    maskv3 = np.array([0, 0, 0], dtype=np.int32)
-    xv3 = np.array([1, 3, 2], dtype=np.float32)
-    yv3 = np.array([5, 6, 9], dtype=np.float32)
-
-    cases = [
-        {"input": [maskv2, xv2, yv2]},
-        {"input": [maskv3, xv3, yv3]},
-    ]
-    opr_test(cases, F.where, ref_fn=np.where)
-
-
-def test_eye():
-    dtype = np.float32
-    cases = [{"input": [10, 20]}, {"input": [20, 30]}]
-    opr_test(cases, F.eye, ref_fn=lambda n, m: np.eye(n, m).astype(dtype), dtype=dtype)
-
-
-def test_concat():
-    def get_data_shape(length: int):
-        return (length, 2, 3)
-
-    data1 = np.random.random(get_data_shape(5)).astype("float32")
-    data2 = np.random.random(get_data_shape(6)).astype("float32")
-    data3 = np.random.random(get_data_shape(7)).astype("float32")
-
-    def run(data1, data2):
-        return F.concat([data1, data2])
-
-    cases = [{"input": [data1, data2]}, {"input": [data1, data3]}]
-    opr_test(cases, run, ref_fn=lambda x, y: np.concatenate([x, y]))
-
-
-def test_matrix_mul():
-    shape1 = (2, 3)
-    shape2 = (3, 4)
-    shape3 = (4, 5)
-    data1 = np.random.random(shape1).astype("float32")
-    data2 = np.random.random(shape2).astype("float32")
-    data3 = np.random.random(shape3).astype("float32")
-
-    cases = [{"input": [data1, data2]}, {"input": [data2, data3]}]
-    opr_test(cases, F.matrix_mul, ref_fn=np.matmul)
-
-
-def test_batched_matrix_mul():
-    batch_size = 10
-    shape1 = (batch_size, 2, 3)
-    shape2 = (batch_size, 3, 4)
-    shape3 = (batch_size, 4, 5)
-    data1 = np.random.random(shape1).astype("float32")
-    data2 = np.random.random(shape2).astype("float32")
-    data3 = np.random.random(shape3).astype("float32")
-
-    cases = [{"input": [data1, data2]}, {"input": [data2, data3]}]
-    for i in range(0, batch_size):
-
-        def compare_fn(x, y):
-            x.numpy()[i, ...] == y
-
-        opr_test(
-            cases,
-            F.batched_matrix_mul,
-            compare_fn=compare_fn,
-            ref_fn=lambda x, y: np.matmul(x[i, ...], y[i, ...]),
-        )
-
-
-def test_sort():
-    data1_shape = (10, 3)
-    data2_shape = (12, 2)
-    data1 = np.random.random(data1_shape).astype(np.float32)
-    data2 = np.random.random(data2_shape).astype(np.float32)
-    output0 = [np.sort(data1), np.argsort(data1).astype(np.int32)]
-    output1 = [np.sort(data2), np.argsort(data2).astype(np.int32)]
-
-    cases = [
-        {"input": data1, "output": output0},
-        {"input": data2, "output": output1},
-    ]
-    opr_test(cases, F.sort)
-
-
-def test_round():
-    data1_shape = (15,)
-    data2_shape = (25,)
-    data1 = np.random.random(data1_shape).astype(np.float32)
-    data2 = np.random.random(data2_shape).astype(np.float32)
-
-    cases = [{"input": data1}, {"input": data2}]
-    opr_test(cases, F.round, ref_fn=np.round)
-
-
-def test_broadcast_to():
-    input1_shape = (20, 30)
-    output1_shape = (30, 20, 30)
-    data1 = np.random.random(input1_shape).astype(np.float32)
-
-    input2_shape = (10, 20)
-    output2_shape = (20, 10, 20)
-    data2 = np.random.random(input2_shape).astype(np.float32)
-
-    def compare_fn(x, y):
-        assert x.numpy().shape == y
-
-    cases = [
-        {"input": [data1, output1_shape], "output": output1_shape},
-        {"input": [data2, output2_shape], "output": output2_shape},
-    ]
-    opr_test(cases, F.broadcast_to, compare_fn=compare_fn)
-
-
-def test_linspace():
-    cases = [
-        {"input": [1, 9, 9]},
-        {"input": [3, 10, 8]},
-    ]
-    opr_test(
-        cases,
-        F.linspace,
-        ref_fn=lambda start, end, step: np.linspace(start, end, step, dtype=np.float32),
-    )
-
-    cases = [
-        {"input": [9, 1, 9]},
-        {"input": [10, 3, 8]},
-    ]
-    opr_test(
-        cases,
-        F.linspace,
-        ref_fn=lambda start, end, step: np.linspace(start, end, step, dtype=np.float32),
-    )
-
-
-def test_arange():
-    cases = [
-        {"input": [1, 9, 1]},
-        {"input": [2, 10, 2]},
-    ]
-    opr_test(
-        cases,
-        F.arange,
-        ref_fn=lambda start, end, step: np.arange(start, end, step, dtype=np.float32),
-    )
-
-    cases = [
-        {"input": [9, 1, -1]},
-        {"input": [10, 2, -2]},
-    ]
-    opr_test(
-        cases,
-        F.arange,
-        ref_fn=lambda start, end, step: np.arange(start, end, step, dtype=np.float32),
-    )
-
-    cases = [
-        {"input": [9.3, 1.2, -0.5]},
-        {"input": [10.3, 2.1, -1.7]},
-    ]
-    opr_test(
-        cases,
-        F.arange,
-        ref_fn=lambda start, end, step: np.arange(start, end, step, dtype=np.float32),
-    )
-
-
-def test_add_update():
-    shape = (2, 3)
-    v = np.random.random(shape).astype(np.float32)
-    b = Buffer(v)
-
-    u = F.add_update(b, 1)
-    assertTensorClose(u.numpy(), v + 1)
-    u = F.add_update(b, 1)
-    assertTensorClose(u.numpy(), v + 2)
-
-    x = np.ones((2, 2), dtype=np.float32)
-    y = x * 0.5
-    dest = tensor(x)
-    delta = tensor(y)
-    r = F.add_update(dest, delta, alpha=tensor(0.9), beta=0.1, bias=0.1)
-    assertTensorClose(r.numpy(), x * 0.9 + y * 0.1 + 0.1)
-
-
-def test_add_update_params():
-    b = np.random.random((2, 3)).astype(np.float32)
-    y = Buffer(b)
-
-    @jit.trace
-    def f(x):
-        return F.add_update(y, x)
-
-    f(np.zeros((2, 3)).astype(np.float32))
-
-    z = Buffer(np.zeros((2, 3)).astype(np.float32))
-    F.add_update(y, z, beta=0.1)
-
-    res = f(np.ones((2, 3)).astype(np.float32))
-    assertTensorClose(res, b + 1)
-
-
-def test_cross_entropy_with_softmax():
-    data1_shape = (1, 2)
-    label1_shape = (1,)
-    data2_shape = (1, 3)
-    label2_shape = (1,)
-
-    data1 = np.array([1, 0.5], dtype=np.float32).reshape(data1_shape)
-    label1 = np.array([1], dtype=np.int32).reshape(label1_shape)
-    expect1 = F.cross_entropy(F.softmax(tensor(data1)), tensor(label1)).numpy()
-
-    data2 = np.array([0.3, 0.4, 0.3], dtype=np.float32).reshape(data2_shape)
-    label2 = np.array([1], dtype=np.int32).reshape(label2_shape)
-    expect2 = F.cross_entropy(F.softmax(tensor(data2)), tensor(label2)).numpy()
-
-    cases = [
-        {"input": [data1, label1], "output": expect1,},
-        {"input": [data2, label2], "output": expect2,},
-    ]
-    opr_test(cases, F.cross_entropy_with_softmax)
-
-
-def test_cross_entropy():
-    data1_shape = (1, 2)
-    label1_shape = (1,)
-    data2_shape = (1, 3)
-    label2_shape = (1,)
-
-    data1 = np.array([0.5, 0.5], dtype=np.float32).reshape(data1_shape)
-    label1 = np.array([1], dtype=np.int32).reshape(label1_shape)
-    expect1 = np.array([-np.log(0.5)], dtype=np.float32)
-
-    data2 = np.array([0.3, 0.4, 0.3], dtype=np.float32).reshape(data2_shape)
-    label2 = np.array([1], dtype=np.int32).reshape(label2_shape)
-    expect2 = np.array([-np.log(0.4)], dtype=np.float32)
-
-    cases = [
-        {"input": [data1, label1], "output": expect1,},
-        {"input": [data2, label2], "output": expect2,},
-    ]
-    opr_test(cases, F.cross_entropy)
-
-
-def test_binary_cross_entropy():
-    data1_shape = (2, 2)
-    label1_shape = (2, 2)
-    data2_shape = (2, 3)
-    label2_shape = (2, 3)
-
-    def sigmoid(x):
-        return 1 / (1 + np.exp(-x))
-
-    def compare_fn(x, y):
-        assertTensorClose(x.numpy(), y, max_err=5e-4)
-
-    np.random.seed(123)
-    data1 = sigmoid(np.random.uniform(size=data1_shape).astype(np.float32))
-    label1 = np.random.uniform(size=label1_shape).astype(np.float32)
-    expect1 = np.array([0.6361], dtype=np.float32)
-
-    np.random.seed(123)
-    data2 = sigmoid(np.random.uniform(size=data2_shape).astype(np.float32))
-    label2 = np.random.uniform(size=label2_shape).astype(np.float32)
-    expect2 = np.array([0.6750], dtype=np.float32)
-
-    cases = [
-        {"input": [data1, label1], "output": expect1,},
-        {"input": [data2, label2], "output": expect2,},
-    ]
-    opr_test(cases, F.binary_cross_entropy, compare_fn=compare_fn)
-
-
-def test_hinge_loss():
-    np.random.seed(123)
-    # case with L1 norm
-    cases = []
-    for shape in [(2, 2), (2, 3)]:
-        data = np.random.uniform(size=shape).astype(np.float32)
-        label = 2 * np.random.randint(0, 1, size=shape).astype(np.int32) - 1
-        expect = np.clip(0, np.inf, 1 - data * label).sum(axis=1).mean()
-        cases.append({"input": [data, label], "output": tensor(expect)})
-
-    opr_test(cases, F.hinge_loss)
-
-    # cases with L2 norm
-    cases = []
-    for shape in [(2, 2), (2, 3)]:
-        data = np.random.uniform(size=shape).astype(np.float32)
-        label = 2 * np.random.randint(0, 1, size=shape).astype(np.int32) - 1
-        expect = ((np.clip(0, np.inf, 1 - data * label) ** 2).sum(axis=1)).mean()
-        cases.append({"input": [data, label], "output": tensor(expect)})
-
-    def hinge_loss_with_l2_norm(pred, label):
-        return F.hinge_loss(pred, label, "L2")
-
-    opr_test(cases, hinge_loss_with_l2_norm)
-
-
-def test_smooth_l1_loss():
-    np.random.seed(123)
-    cases = []
-    for shape in [(2, 2), (2, 3)]:
-        data = np.random.uniform(size=shape).astype(np.float32)
-        label = np.random.uniform(size=shape).astype(np.float32)
-        diff = np.abs(data - label)
-        expect = np.where(diff < 1, 0.5 * diff ** 2, diff - 0.5).mean()
-        cases.append({"input": [data, label], "output": tensor(expect)})
-
-    opr_test(cases, F.smooth_l1_loss)
-
-
-@pytest.mark.skip
-def test_conv_bias():
-    inp_scale = 0.01
-    w_scale = 0.02
-    outp_scale = 0.1
-    inp_dtype = mgb.dtype.qint8(inp_scale)
-    w_dtype = mgb.dtype.qint8(w_scale)
-    b_dtype = mgb.dtype.qint32(inp_scale * w_scale)
-    out_dtype = mgb.dtype.qint8(outp_scale)
-
-    def run(
-        N,
-        IC,
-        OC,
-        IH,
-        IW,
-        KH,
-        KW,
-        PH,
-        PW,
-        SH,
-        SW,
-        has_bias=True,
-        nonlinear_mode="IDENTITY",
-    ):
-        inp_v = np.random.normal(size=(N, IC, IH, IW))
-        w_v = np.random.normal(size=(OC, IC, KW, KW))
-        b_v = np.random.normal(size=(1, OC, 1, 1))
-        inp_scale = mgb.dtype.get_scale(inp_dtype)
-        w_scale = mgb.dtype.get_scale(w_dtype)
-        b_scale = mgb.dtype.get_scale(b_dtype)
-
-        inpv = mgb.dtype.convert_to_qint8(inp_v * inp_scale, inp_dtype)
-        wv = mgb.dtype.convert_to_qint8(w_v * w_scale, w_dtype)
-        bv = mgb.dtype.convert_to_qint32(b_v * b_scale, b_dtype)
-
-        inp_int8 = tensor(inpv, dtype=inp_dtype)
-        w_int8 = Parameter(wv, dtype=w_dtype)
-        b_int32 = Parameter(bv, dtype=b_dtype)
-
-        inp_fp32 = inp_int8.astype("float32")
-        w_fp32 = w_int8.astype("float32")
-        b_fp32 = b_int32.astype("float32")
-
-        jit.trace.enabled = True
-        b_symbolic = True
-
-        def convert_to_nchw4(var):
-            return var.reshape(
-                var.shapeof(0), var.shapeof(1) // 4, 4, var.shapeof(2), var.shapeof(3)
-            ).dimshuffle(0, 1, 3, 4, 2)
-
-        @jit.trace(symbolic=b_symbolic)
-        def run_conv2d(inp, w, b):
-            O = F.conv2d(
-                inp, w, b if has_bias else None, stride=(SH, SW), padding=(PH, PW),
-            )
-            if nonlinear_mode == "RELU":
-                return F.relu(O)
-            else:
-                return O
-
-        @jit.trace(symbolic=b_symbolic)
-        def run_conv_bias(inp, w, b, format="NCHW"):
-            b = b if has_bias else np.zeros_like(b)
-            if format == "NCHW4":
-                inp = convert_to_nchw4(inp)
-                w = convert_to_nchw4(w)
-                b = F.flatten(b)
-            return F.conv_bias_activation(
-                inp,
-                w,
-                b,
-                stride=(SH, SW),
-                padding=(PH, PW),
-                dtype=out_dtype,
-                nonlinear_mode=nonlinear_mode,
-            )
-
-        format = "NCHW4" if is_cuda_available() else "NCHW"
-
-        expected = run_conv2d(inp_fp32, w_fp32, b_fp32)
-        expected = expected.astype(out_dtype).astype("float32")
-        result = run_conv_bias(inp_int8, w_int8, b_int32, format=format).astype(
-            "float32"
-        )
-        if format == "NCHW4":
-            result = result.dimshuffle(0, 1, 4, 2, 3)
-        expected = F.flatten(expected)
-        result = F.flatten(result)
-        assertTensorClose(result.numpy(), expected.numpy())
-
-    if not is_cuda_available():
-        run(1, 4, 4, 24, 33, 1, 1, 2, 3, 1, 1, False)
-        run(10, 12, 24, 46, 46, 1, 1, 2, 1, 3, 1, False)
-        run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, False)
-
-    run(1, 4, 4, 24, 33, 1, 1, 2, 3, 1, 1)
-    run(10, 12, 24, 46, 46, 1, 1, 2, 1, 3, 1)
-    run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2)
-
-    run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, False, "RELU")
-    run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, True, "RELU")
-
-
-def test_softplus():
-    x = np.arange(1000).astype(np.float32)
-    out = F.softplus(tensor(x))
-    mask = x <= 20
-    with np.errstate(over="ignore"):
-        expected = np.where(mask, np.log(1 + np.exp(x)), x)
-    assertTensorClose(out, expected)
-    beta = 2
-    out = F.softplus(tensor(x), beta=beta, threshold=30)
-    mask = beta * x <= 30
-    # ignore overflow
-    with np.errstate(over="ignore"):
-        expected = np.where(mask, np.log(1 + np.exp(x * beta)) / beta, x)
-    assertTensorClose(out, expected)
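test_softplus above builds its reference with the same threshold trick the operator itself needs: for large inputs exp(beta*x) overflows float32, but there softplus(x) already equals x to within rounding, so a linear branch takes over. The reference as a standalone function (log1p is used here where the deleted test writes log(1 + exp(...)); the two are equivalent):

    import numpy as np

    def softplus_ref(x, beta=1.0, threshold=20.0):
        x = np.asarray(x, dtype=np.float32)
        mask = beta * x <= threshold
        with np.errstate(over="ignore"):  # the masked-out branch may overflow
            return np.where(mask, np.log1p(np.exp(beta * x)) / beta, x)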
diff --git a/python_module/test/unit/functional/test_interpolate.py b/python_module/test/unit/functional/test_interpolate.py
deleted file mode 100644
index 2da981ff..00000000
--- a/python_module/test/unit/functional/test_interpolate.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import numpy as np
-import pytest
-
-import megengine.functional as F
-from megengine import tensor
-from megengine.test import assertTensorClose
-
-
-def test_linear_interpolate():
-    inp = tensor(np.arange(1, 3, dtype=np.float32).reshape(1, 1, 2))
-
-    out = F.interpolate(inp, scale_factor=2.0, mode="LINEAR")
-    out2 = F.interpolate(inp, 4, mode="LINEAR")
-
-    assertTensorClose(
-        out.numpy(), np.array([[[1.0, 1.25, 1.75, 2.0]]], dtype=np.float32)
-    )
-    assertTensorClose(
-        out2.numpy(), np.array([[[1.0, 1.25, 1.75, 2.0]]], dtype=np.float32)
-    )
-
-
-def test_many_batch_interpolate():
-    inp = tensor(np.arange(1, 9, dtype=np.float32).reshape(2, 1, 2, 2))
-
-    out = F.interpolate(inp, [4, 4])
-    out2 = F.interpolate(inp, scale_factor=2.0)
-
-    assertTensorClose(out.numpy(), out2.numpy())
-
-
-def test_assign_corner_interpolate():
-    inp = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2))
-
-    out = F.interpolate(inp, [4, 4], align_corners=True)
-    out2 = F.interpolate(inp, scale_factor=2.0, align_corners=True)
-
-    assertTensorClose(out.numpy(), out2.numpy())
-
-
-def test_error_shape_linear_interpolate():
-    inp = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2))
-
-    with pytest.raises(ValueError):
-        F.interpolate(inp, scale_factor=2.0, mode="LINEAR")
-
-
-def test_inappropriate_scale_linear_interpolate():
-    inp = tensor(np.arange(1, 3, dtype=np.float32).reshape(1, 1, 2))
-
-    with pytest.raises(ValueError):
-        F.interpolate(inp, scale_factor=[2.0, 3.0], mode="LINEAR")
diff --git a/python_module/test/unit/functional/test_math.py b/python_module/test/unit/functional/test_math.py
deleted file mode 100644
index 6dc5c82c..00000000
--- a/python_module/test/unit/functional/test_math.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-from functools import partial
-
-import numpy as np
-from helpers import opr_test
-
-import megengine.functional as F
-from megengine.test import assertTensorClose
-
-
-def common_test_reduce(opr, ref_opr):
-    data1_shape = (5, 6, 7)
-    data2_shape = (2, 9, 12)
-    data1 = np.random.random(data1_shape).astype(np.float32)
-    data2 = np.random.random(data2_shape).astype(np.float32)
-    cases = [{"input": data1}, {"input": data2}]
-
-    if opr not in (F.argmin, F.argmax):
-        # test default axis
-        opr_test(cases, opr, ref_fn=ref_opr)
-        # test all axises in range of input shape
-        for axis in range(-3, 3):
-            # test keepdims False
-            opr_test(cases, opr, ref_fn=lambda x: ref_opr(x, axis=axis), axis=axis)
-            # test keepdims True
-            opr_test(
-                cases,
-                opr,
-                ref_fn=lambda x: ref_opr(x, axis=axis, keepdims=True),
-                axis=axis,
-                keepdims=True,
-            )
-    else:
-        # test defaut axis
-        opr_test(cases, opr, ref_fn=lambda x: ref_opr(x).astype(np.int32))
-        # test all axises in range of input shape
-        for axis in range(0, 3):
-            opr_test(
-                cases,
-                opr,
-                ref_fn=lambda x: ref_opr(x, axis=axis).astype(np.int32),
-                axis=axis,
-            )
-
-
-def test_sum():
-    common_test_reduce(opr=F.sum, ref_opr=np.sum)
-
-
-def test_prod():
-    common_test_reduce(opr=F.prod, ref_opr=np.prod)
-
-
-def test_mean():
-    common_test_reduce(opr=F.mean, ref_opr=np.mean)
-
-
-def test_min():
-    common_test_reduce(opr=F.min, ref_opr=np.min)
-
-
-def test_max():
-    common_test_reduce(opr=F.max, ref_opr=np.max)
-
-
-def test_argmin():
-    common_test_reduce(opr=F.argmin, ref_opr=np.argmin)
-
-
-def test_argmax():
-    common_test_reduce(opr=F.argmax, ref_opr=np.argmax)
-
-
-def test_sqrt():
-    d1_shape = (15,)
-    d2_shape = (25,)
-    d1 = np.random.random(d1_shape).astype(np.float32)
-    d2 = np.random.random(d2_shape).astype(np.float32)
-
-    cases = [{"input": d1}, {"input": d2}]
-    opr_test(cases, F.sqrt, ref_fn=np.sqrt)
-
-
-def test_normalize():
-
-    cases = [
-        {"input": np.random.random((2, 3, 12, 12)).astype(np.float32)} for i in range(2)
-    ]
-
-    def np_normalize(x, p=2, axis=None, eps=1e-12):
-        if axis is None:
-            norm = np.sum(x ** p) ** (1.0 / p)
-        else:
-            norm = np.sum(x ** p, axis=axis, keepdims=True) ** (1.0 / p)
-        return x / np.clip(norm, a_min=eps, a_max=np.inf)
-
-    # Test L-2 norm along all dimensions
-    opr_test(cases, F.normalize, ref_fn=np_normalize)
-
-    # Test L-1 norm along all dimensions
-    opr_test(cases, partial(F.normalize, p=1), ref_fn=partial(np_normalize, p=1))
-
-    # Test L-2 norm along the second dimension
-    opr_test(cases, partial(F.normalize, axis=1), ref_fn=partial(np_normalize, axis=1))
-
-    # Test some norm == 0
-    cases[0]["input"][0, 0, 0, :] = 0
-    cases[1]["input"][0, 0, 0, :] = 0
-    opr_test(cases, partial(F.normalize, axis=3), ref_fn=partial(np_normalize, axis=3))
-
-
-def test_logsumexp():
-    x = np.arange(10).astype(np.float32)
-    expected = np.log(np.sum(np.exp(x)))
-    cases = [{"input": x, "output": expected}]
-    compare_fn = partial(assertTensorClose, allow_special_values=True)
-    # large value check
-    n = 100
-    x = np.full(n, 10000, dtype=np.float32)
-    expected = 10000 + np.log(n)
-    cases.append({"input": x, "output": expected.astype(np.float32)})
-    opr_test(cases, F.logsumexp, axis=0, compare_fn=compare_fn)
-
-    # special value check
-    x = np.array([np.inf], dtype=np.float32)
-    expected = x
-    cases = [{"input": x, "output": expected}]
-
-    x = np.array([-np.inf, 0.0], dtype=np.float32)
-    expected = np.zeros(1).astype(np.float32)
-    cases.append({"input": x, "output": expected})
-    opr_test(cases, F.logsumexp, axis=0, compare_fn=compare_fn)
-
-    x = np.array([np.nan], dtype=np.float32)
-    expected = x
-    cases = [{"input": x, "output": expected}]
-
-    x = np.array([-np.inf, 1], dtype=np.float32)
-    expected = np.array([1.0], dtype=np.float32)
-    cases.append({"input": x, "output": expected})
-
-    opr_test(cases, F.logsumexp, axis=0, compare_fn=compare_fn)
-
-    # keepdims check
-    x = np.array([[1e10, 1e-10], [-1e10, -np.inf]], dtype=np.float32)
-    expected = np.array([[1e10], [-1e10]], dtype=np.float32)
-    cases = [{"input": x, "output": expected}]
-    x = np.array([[1e10, -1e-10, 1e-10], [1e10, 1e-10, np.inf]], dtype=np.float32)
-    expected = np.array([[1e10], [np.inf]], dtype=np.float32)
-    cases.append({"input": x, "output": expected})
-    opr_test(cases, F.logsumexp, axis=1, keepdims=True, compare_fn=compare_fn)
-
-    # multiple axes check
-    x = np.array([[1e10, 1e-10], [-1e10, -np.inf]], dtype=np.float32)
-    expected = np.array([1e10], dtype=np.float32)
-    cases = [{"input": x, "output": expected}]
-    x = np.array([[1e10, -1e-10, 1e-10], [1e10, 1e-10, np.inf]], dtype=np.float32)
-    expected = np.array([np.inf], dtype=np.float32)
-    cases.append({"input": x, "output": expected})
-    opr_test(cases, F.logsumexp, axis=(0, 1), keepdims=False, compare_fn=compare_fn)
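The large-value case in test_logsumexp above (n copies of 1e4 giving exactly 1e4 + log(n)) only passes if logsumexp is computed with the shift-by-max identity; a naive exp would overflow float32 immediately. A reference consistent with those cases, under the assumption that non-finite maxima are left unshifted so the inf/-inf/nan rows stay exact:

    import numpy as np

    def logsumexp_ref(x, axis=None, keepdims=False):
        m = np.max(x, axis=axis, keepdims=True)
        shift = np.where(np.isfinite(m), m, 0.0)  # keep inf/-inf/nan rows exact
        with np.errstate(divide="ignore"):
            out = np.log(np.sum(np.exp(x - shift), axis=axis, keepdims=True)) + shift
        return out if keepdims else np.squeeze(out)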
diff --git a/python_module/test/unit/functional/test_onehot.py b/python_module/test/unit/functional/test_onehot.py
deleted file mode 100644
index 3edbe5de..00000000
--- a/python_module/test/unit/functional/test_onehot.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import numpy as np
-import pytest
-
-import megengine.functional as F
-from megengine import tensor
-from megengine.test import assertTensorClose
-
-
-def test_onehot_low_dimension():
-    inp = tensor(np.arange(1, 4, dtype=np.int32))
-    out = F.one_hot(inp, num_classes=4)
-
-    assertTensorClose(
-        out.numpy(), np.eye(4, dtype=np.int32)[np.arange(1, 4, dtype=np.int32)]
-    )
-
-
-def test_onehot_high_dimension():
-    arr = np.array(
-        [[3, 2, 4, 4, 2, 4, 0, 4, 4, 1], [4, 1, 1, 3, 2, 2, 4, 2, 4, 3]], dtype=np.int32
-    )
-
-    inp = tensor(arr)
-    out = F.one_hot(inp, 10)
-
-    assertTensorClose(out.numpy(), np.eye(10, dtype=np.int32)[arr])
diff --git a/python_module/test/unit/hub/__init__.py b/python_module/test/unit/hub/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/python_module/test/unit/hub/test_hub.py b/python_module/test/unit/hub/test_hub.py
deleted file mode 100644
index 5fdb32ec..00000000
--- a/python_module/test/unit/hub/test_hub.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import pytest
-from helpers import modified_environ
-
-from megengine.hub import hub
-
-
-@pytest.mark.internet
-def test_hub_http_basic(tmp_path):
-    # Override XDG_CACHE_HOME to make sure test won't have side effect for system.
-    with modified_environ(XDG_CACHE_HOME=str(tmp_path)):
-        # Use pytorch's URL due to we don't have public address now.
-        repo_info, entry = "pytorch/vision:v0.4.2", "alexnet"
-
-        assert len(hub.list(repo_info)) > 0
-
-        assert entry in hub.list(repo_info)
-
-        assert hub.help(repo_info, entry)
-
-        assert isinstance(hub.load(repo_info, entry), object)
-
-
-@pytest.mark.internet
-def test_github_load_with_commit_id(tmp_path):
-    # Override XDG_CACHE_HOME to make sure test won't have side effect for system.
-    with modified_environ(XDG_CACHE_HOME=str(tmp_path)):
-        # Use pytorch's URL due to we don't have public address now.
-        repo_info, commit, entry = "pytorch/vision", "d2c763e1", "alexnet"
-
-        assert isinstance(hub.load(repo_info, entry, commit=commit), object)
diff --git a/python_module/test/unit/jit/__init__.py b/python_module/test/unit/jit/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/python_module/test/unit/jit/test_jit.py b/python_module/test/unit/jit/test_jit.py
deleted file mode 100644
index b14b1dfc..00000000
--- a/python_module/test/unit/jit/test_jit.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import contextlib
-import os
-import tempfile
-
-import numpy as np
-import pytest
-
-import megengine as mge
-import megengine._internal as mgb
-import megengine.functional as F
-import megengine.module as M
-from megengine import functional as F
-from megengine import jit, tensor
-from megengine.core.tensor import Tensor
-from megengine.jit import SublinearMemoryConfig
-from megengine.test import assertTensorClose
-
-
-@contextlib.contextmanager
-def mkstemp():
-    fd, path = tempfile.mkstemp()
-    try:
-        os.close(fd)
-        yield path
-    finally:
-        os.remove(path)
-
-
-def load_and_compile(fpath):
-    cg, _, outputs = mgb.load_comp_graph_from_file(fpath)
-    inputs = mgb.cgtools.get_dep_vars(outputs, "Host2DeviceCopy")
-    inputs = sorted(inputs, key=lambda i: i.name)
-    outputs = list(map(mgb.copy_output, outputs))
-    if len(outputs) == 1:
-        (outputs,) = outputs
-    return cg.compile(inputs, outputs)
-
-
-def test_symbolic():
-    @jit.trace(symbolic=False)
-    def f(x):
-        return Tensor(mgb.opr.assert_equal(x._symvar, x._symvar + 1))
-
-    with pytest.raises(mgb.exc.MegBrainError):
-        f.trace(0)
-
-    @jit.trace(symbolic=True)
-    def f(x):
-        return Tensor(mgb.opr.assert_equal(x._symvar, x._symvar + 1))
-
-    f.trace(0)
-
-
-def test_add_update_semantic():
-    for symbolic in [False, True]:
-        x = tensor(0)
-
-        @jit.trace(symbolic=symbolic)
-        def f():
-            F.add_update(x, 1)
-            return x + 1
-
-        np.testing.assert_equal(f().numpy(), [2])
-        np.testing.assert_equal(f().numpy(), [3])
-
-
-def test_dump():
-    @jit.trace(symbolic=True)
-    def f(x, y):
-        return x * y
-
-    f.trace(0, 0)
-
-    with mkstemp() as out:
-        f.dump(out)
-        g = load_and_compile(out)
-
-    np.testing.assert_allclose(g([1, 2, 3], [1, 2, 3]), [1, 4, 9])
-
-
-def test_goptions():
-    @jit.trace(symbolic=True, opt_level=0)
-    def f(x):
-        return x / x
-
-    @jit.trace(symbolic=True, opt_level=1)
-    def g(x):
-        return x / x
-
-    out = f([0.0]).numpy()
-    # out is nan
-    if out == out:
-        raise
-
-    # with gopt, x / x returns 1
-    out = g([0.0]).numpy()
-    assert out == 1
-
-
-def test_json_prof():
-    @jit.trace(symbolic=True, profiling=True)
-    def f(x):
-        return x * x
-
-    f([0.0])
-
-    out = f.get_profile()
-    assert out.get("profiler")
-
-
-def test_capture_dump():
-    p = tensor(7)
-
-    @jit.trace(symbolic=True)
-    def f(x):
-        return x * p
-
-    f.trace(0)
-
-    with mkstemp() as out:
-        f.dump(out)
-        g = load_and_compile(out)
-
-    np.testing.assert_allclose(g([1, 2, 3]), [7, 14, 21])
-
-
-def test_dump_volatile():
-    p = tensor(7)
-
-    @jit.trace(symbolic=True)
-    def f(x):
-        return x * p
-
-    f.trace(0)
-
-    with mkstemp() as out:
-        f.dump(out)
-        cg, _, outputs = mgb.load_comp_graph_from_file(out)
-
-    (out,) = outputs
-    assert mgb.cgtools.get_type(mgb.cgtools.get_inputs(out)[1]) == "SharedDeviceTensor"
-
-
-def test_graph_traversal():
-    net = M.Conv2d(3, 4, 3, 1, 1, groups=1, bias=False)
-    net.eval()
-
-    @jit.trace(symbolic=True)
-    def fun(data):
-        return net(data)
-
-    data = np.random.random([1, 3, 224, 224]).astype(np.float32)
-    fun.trace(data)
-
-    with mkstemp() as out:
-        fun.dump(out)
-        *_, outputs = mgb.load_comp_graph_from_file(out)
-
-    _, map_vars, var2oprs, *_ = mgb.cgtools.graph_traversal(outputs)
-    input_var = map_vars[1]
-    _, var_idx = var2oprs[input_var.id][0]
-
-    assert var_idx == 0
-
-
-def test_network_visitor():
-    @jit.trace(symbolic=True)
-    def f(x):
-        # this line will produce shape_of, subtensor and concat op
-        # after pruning, they will be deleted
-        target_shape = (x.shape[0], -1)
-
-        return x.reshape(*target_shape)
-
-    f.trace(tensor(np.random.random([2, 3, 4, 5]).astype(np.float32)))
-
-    with mkstemp() as out:
-        f.dump(out)
-        *_, outputs = mgb.load_comp_graph_from_file(out)
-
-    all_oprs = mgb.cgtools.get_oprs_seq(outputs)
-    pruned_oprs = mgb.cgtools.get_oprs_seq(outputs, prune_reshape=True)
-
-    assert len(all_oprs) == len(pruned_oprs) + 3
-
-
-def test_shape_tracing():
-    for symbolic in [False, True]:
-
-        @jit.trace(symbolic=symbolic)
-        def f(x):
-            a, b = x.shape
-            return a * b
-
-        assert f(np.zeros([4, 3], dtype="float32")).item() == 12
-        assert f(np.zeros([6, 4], dtype="float32")).item() == 24
-
-
-def test_shape_infer():
-    @jit.trace(symbolic=True)
-    def f(x):
-        a, b = x.shape
-        return sum(x[i] for i in range(a))
-
-    x = np.random.randn(3, 10).astype("float32")
-    assertTensorClose(f(x), x.sum(0))
-    x = np.random.randn(4, 10).astype("float32")
-    assertTensorClose(f(x), x[:3].sum(0))
-
-
-def test_dump_bn_fused():
-    class ConvBNReLU(M.Sequential):
-        def __init__(self):
-            super(ConvBNReLU, self).__init__(
-                M.Conv2d(3, 4, 3, 1, 1, groups=1, bias=False),
-                M.BatchNorm2d(4),
-                M.ReLU(),
-            )
-
-    net = ConvBNReLU()
-    net.eval()
-
-    @jit.trace(symbolic=True)
-    def fun(data):
-        return net(data)
-
-    data = np.random.random([1, 3, 224, 224]).astype(np.float32)
-    fun.trace(data)
-    with mkstemp() as out:
-        fun.dump(out, optimize_for_inference=True)
-        cg, _, outputs = mgb.load_comp_graph_from_file(out)
-
-    (out,) = outputs
-    inputs = mgb.cgtools.get_inputs(out)
-    assert len(inputs) == 2 and (
-        mgb.cgtools.get_type(inputs[0]) == "MultipleDeviceTensorHolder"
-        and mgb.cgtools.get_type(inputs[1]) == "ConvolutionForward"
-    )
-
-
-# Simply verify the options passed down
-def test_sublinear():
-    config = SublinearMemoryConfig(genetic_nr_iter=10)
-
-    @jit.trace(symbolic=True, sublinear_memory_config=config)
-    def f(x):
-        return x + x
-
-    f([0.0])
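A note on the `if out == out: raise` check in test_goptions above: it is an idiomatic NaN probe. Under IEEE-754, 0.0/0.0 evaluates to NaN, and NaN is the only floating-point value unequal to itself, so the unoptimized graph (opt_level=0) must produce a self-unequal result while the optimized one folds x/x to the constant 1:

    import numpy as np

    with np.errstate(invalid="ignore"):
        out = np.float32(0.0) / np.float32(0.0)  # nan under IEEE-754
    assert out != out  # nan is the only float unequal to itself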
diff --git a/python_module/test/unit/module/.gitattributes b/python_module/test/unit/module/.gitattributes
deleted file mode 100644
index 816f72d6..00000000
--- a/python_module/test/unit/module/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-*.mlu binary
diff --git a/python_module/test/unit/module/AtlasRuntimeOprTest.basic.om b/python_module/test/unit/module/AtlasRuntimeOprTest.basic.om
deleted file mode 100644
index 942fe2ed..00000000
Binary files a/python_module/test/unit/module/AtlasRuntimeOprTest.basic.om and /dev/null differ
diff --git a/python_module/test/unit/module/__init__.py b/python_module/test/unit/module/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/python_module/test/unit/module/test_activation.py b/python_module/test/unit/module/test_activation.py
deleted file mode 100644
index afca1fde..00000000
--- a/python_module/test/unit/module/test_activation.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import numpy as np
-
-import megengine as mge
-from megengine.module import LeakyReLU
-from megengine.test import assertTensorClose
-
-
-def test_leaky_relu():
-    data = np.array([-8, -12, 6, 10]).astype(np.float32)
-    negative_slope = 0.1
-
-    leaky_relu = LeakyReLU(negative_slope)
-    output = leaky_relu(mge.tensor(data))
-
-    np_output = np.maximum(0, data) + negative_slope * np.minimum(0, data)
-    assertTensorClose(output.numpy(), np_output, max_err=0)
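The LeakyReLU test above checks against the usual closed form: the positive part passes through unchanged and the negative part is scaled by the slope. The same reference as a plain function:

    import numpy as np

    def leaky_relu_ref(x, negative_slope=0.1):
        return np.maximum(0, x) + negative_slope * np.minimum(0, x)

    x = np.array([-8.0, -12.0, 6.0, 10.0], dtype=np.float32)
    np.testing.assert_allclose(leaky_relu_ref(x), [-0.8, -1.2, 6.0, 10.0], rtol=1e-6)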
diff --git a/python_module/test/unit/module/test_batchnorm.py b/python_module/test/unit/module/test_batchnorm.py
deleted file mode 100644
index cd651444..00000000
--- a/python_module/test/unit/module/test_batchnorm.py
+++ /dev/null
@@ -1,406 +0,0 @@
-# -*- coding: utf-8 -*-
-# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
-#
-# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import multiprocessing as mp
-import platform
-
-import numpy as np
-import pytest
-
-import megengine as mge
-import megengine.distributed as dist
-from megengine.core import tensor
-from megengine.module import BatchNorm1d, BatchNorm2d, SyncBatchNorm
-from megengine.test import assertTensorClose
-
-
-@pytest.mark.skipif(
-    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
-)
-@pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
-)
-@pytest.mark.isolated_distributed
-def test_syncbn():
-    nr_chan = 8
-    data_shape = (3, nr_chan, 4, 16)
-    momentum = 0.9
-    eps = 1e-5
-    running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32)
-    running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32)
-    steps = 4
-    nr_ranks = 2
-
-    def worker(rank, data, yv_expect, running_mean, running_var):
-        if mge.get_device_count("gpu") < nr_ranks:
-            return
-        dist.init_process_group("localhost", 2333, nr_ranks, rank, rank)
-        bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
-        data_tensor = tensor()
-        for i in range(steps):
-            data_tensor.set_value(data[i])
-            yv = bn(data_tensor)
-
-        assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6)
-        assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6)
-        assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6)
-
-    xv = []
-    for i in range(steps):
-        xv.append(np.random.normal(loc=2.3, size=data_shape).astype(np.float32))
-        xv_transposed = np.transpose(xv[i], [0, 2, 3, 1]).reshape(
-            (data_shape[0] * data_shape[2] * data_shape[3], nr_chan)
-        )
-
-        mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1)
-
-        var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1))
-        sd = np.sqrt(var_biased + eps)
-
-        var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape((1, nr_chan, 1, 1))
-        running_mean = running_mean * momentum + mean * (1 - momentum)
-        running_var = running_var * momentum + var_unbiased * (1 - momentum)
-
-        yv_expect = (xv[i] - mean) / sd
-
-    data = []
-    for i in range(nr_ranks):
-        data.append([])
-        for j in range(steps):
-            data[i].append(xv[j][:, :, :, i * 8 : i * 8 + 8])
-
-    procs = []
-    for rank in range(nr_ranks):
-        p = mp.Process(
-            target=worker,
-            args=(
-                rank,
-                data[rank],
-                yv_expect[:, :, :, rank * 8 : rank * 8 + 8],
-                running_mean,
-                running_var,
-            ),
-        )
-        p.start()
-        procs.append(p)
-
-    for p in procs:
-        p.join(10)
-        assert p.exitcode == 0
-
-
-def test_batchnorm():
-    nr_chan = 8
-    data_shape = (3, nr_chan, 4)
-    momentum = 0.9
-    bn = BatchNorm1d(nr_chan, momentum=momentum)
-    running_mean = np.zeros((1, nr_chan, 1), dtype=np.float32)
-    running_var = np.ones((1, nr_chan, 1), dtype=np.float32)
-    data = tensor()
-    for i in range(3):
-        xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32)
-        mean = np.mean(np.mean(xv, axis=0, keepdims=True), axis=2, keepdims=True)
-        xv_transposed = np.transpose(xv, [0, 2, 1]).reshape(
-            (data_shape[0] * data_shape[2], nr_chan)
-        )
-
-        var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1))
-        sd = np.sqrt(var_biased + bn.eps)
-
-        var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape((1, nr_chan, 1))
-        running_mean = running_mean * momentum + mean * (1 - momentum)
-        running_var = running_var * momentum + var_unbiased * (1 - momentum)
-
-        data.set_value(xv)
-        yv = bn(data)
-        yv_expect = (xv - mean) / sd
-
-        assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6)
-        assertTensorClose(
-            running_mean.reshape(-1), bn.running_mean.numpy().reshape(-1), max_err=5e-6
-        )
-        assertTensorClose(
-            running_var.reshape(-1), bn.running_var.numpy().reshape(-1), max_err=5e-6
-        )
-
-    # test set 'training' flag to False
-    mean_backup = bn.running_mean.numpy()
-    var_backup = bn.running_var.numpy()
-    bn.training = False
-    xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32)
-    data.set_value(xv)
-    yv1 = bn(data)
-    yv2 = bn(data)
-    assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0)
-    assertTensorClose(mean_backup, bn.running_mean.numpy(), max_err=0)
-    assertTensorClose(var_backup, bn.running_var.numpy(), max_err=0)
-    yv_expect = (xv - running_mean) / np.sqrt(running_var + bn.eps)
-    assertTensorClose(yv_expect, yv1.numpy(), max_err=5e-6)
-
-
-@pytest.mark.skipif(
-    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
-)
-@pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
-)
-def test_syncbn1d():
-    nr_chan = 8
-    data_shape = (3, nr_chan, 4)
-    momentum = 0.9
-    bn = SyncBatchNorm(nr_chan, momentum=momentum)
-    running_mean = np.zeros((1, nr_chan, 1), dtype=np.float32)
-    running_var = np.ones((1, nr_chan, 1), dtype=np.float32)
-    data = tensor()
-    for i in range(3):
-        xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32)
-        mean = np.mean(np.mean(xv, axis=0, keepdims=True), axis=2, keepdims=True)
-        xv_transposed = np.transpose(xv, [0, 2, 1]).reshape(
-            (data_shape[0] * data_shape[2], nr_chan)
-        )
-
-        var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1))
-        sd = np.sqrt(var_biased + bn.eps)
-
-        var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape((1, nr_chan, 1))
-        running_mean = running_mean * momentum + mean * (1 - momentum)
-        running_var = running_var * momentum + var_unbiased * (1 - momentum)
-
-        data.set_value(xv)
-        yv = bn(data)
-        yv_expect = (xv - mean) / sd
-
-        assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6)
-        assertTensorClose(
-            running_mean.reshape(-1), bn.running_mean.numpy().reshape(-1), max_err=5e-6
-        )
-        assertTensorClose(
-            running_var.reshape(-1), bn.running_var.numpy().reshape(-1), max_err=5e-6
-        )
-
-    # test set 'training' flag to False
-    mean_backup = bn.running_mean.numpy()
-    var_backup = bn.running_var.numpy()
-    bn.training = False
-    xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32)
-    data.set_value(xv)
-    yv1 = bn(data)
-    yv2 = bn(data)
-    assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0)
-    assertTensorClose(mean_backup, bn.running_mean.numpy(), max_err=0)
-    assertTensorClose(var_backup, bn.running_var.numpy(), max_err=0)
-    yv_expect = (xv - running_mean) / np.sqrt(running_var + bn.eps)
-    assertTensorClose(yv_expect, yv1.numpy(), max_err=5e-6)
-
-
-def test_batchnorm2d():
-    nr_chan = 8
-    data_shape = (3, nr_chan, 16, 16)
-    momentum = 0.9
-    bn = BatchNorm2d(nr_chan, momentum=momentum)
-    running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32)
-    running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32)
-    data = tensor()
-    for i in range(3):
-        xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32)
-        xv_transposed = np.transpose(xv, [0, 2, 3, 1]).reshape(
-            (data_shape[0] * data_shape[2] * data_shape[3], nr_chan)
-        )
-
-        mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1)
-
-        var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1))
-        sd = np.sqrt(var_biased + bn.eps)
-
-        var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape((1, nr_chan, 1, 1))
-        running_mean = running_mean * momentum + mean * (1 - momentum)
-        running_var = running_var * momentum + var_unbiased * (1 - momentum)
-
-        data.set_value(xv)
-        yv = bn(data)
-        yv_expect = (xv - mean) / sd
-
-        assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6)
-        assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6)
-        assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6)
-
-    # test set 'training' flag to False
-    mean_backup = bn.running_mean.numpy()
-    var_backup = bn.running_var.numpy()
-    bn.training = False
-    xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32)
-    data.set_value(xv)
-    yv1 = bn(data)
-    yv2 = bn(data)
-    assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0)
-    assertTensorClose(mean_backup, bn.running_mean.numpy(), max_err=0)
-    assertTensorClose(var_backup, bn.running_var.numpy(), max_err=0)
-    yv_expect = (xv - running_mean) / np.sqrt(running_var + bn.eps)
-    assertTensorClose(yv_expect, yv1.numpy(), max_err=5e-6)
-
-
-@pytest.mark.skipif(
-    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
-)
-@pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
-)
-def test_syncbn2d():
-    nr_chan = 8
-    data_shape = (3, nr_chan, 16, 16)
-    momentum = 0.9
-    bn = SyncBatchNorm(nr_chan, momentum=momentum)
-    running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32)
-    running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32)
-    data = tensor()
-    for i in range(3):
-        xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32)
-        xv_transposed = np.transpose(xv, [0, 2, 3, 1]).reshape(
-            (data_shape[0] * data_shape[2] * data_shape[3], nr_chan)
-        )
-
-        mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1)
-
-        var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1))
-        sd = np.sqrt(var_biased + bn.eps)
-
-        var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape((1, nr_chan, 1, 1))
-        running_mean = running_mean * momentum + mean * (1 - momentum)
-        running_var = running_var * momentum + var_unbiased * (1 - momentum)
-
-        data.set_value(xv)
-        yv = bn(data)
-        yv_expect = (xv - mean) / sd
-
-        assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6)
-        assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6)
-        assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6)
-
-    # test set 'training' flag to False
-    mean_backup = bn.running_mean.numpy()
-    var_backup = bn.running_var.numpy()
-    bn.training = False
-    xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32)
-    data.set_value(xv)
-    yv1 = bn(data)
-    yv2 = bn(data)
-    assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0)
-    assertTensorClose(mean_backup, bn.running_mean.numpy(), max_err=0)
-    assertTensorClose(var_backup, bn.running_var.numpy(), max_err=0)
-    yv_expect = (xv - running_mean) / np.sqrt(running_var + bn.eps)
-    assertTensorClose(yv_expect, yv1.numpy(), max_err=5e-6)
-
-
-def test_batchnorm_no_stats():
-    nr_chan = 8
-    data_shape = (3, nr_chan, 4)
-    bn = BatchNorm1d(8, track_running_stats=False)
-    data = tensor()
-    for i in range(4):
-        if i == 2:
-            bn.training = False
-        xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32)
-        mean = np.mean(np.mean(xv, axis=0, keepdims=True), axis=2, keepdims=True)
-        var = np.var(
-            np.transpose(xv, [0, 2, 1]).reshape(
-                (data_shape[0] * data_shape[2], nr_chan)
-            ),
-            axis=0,
-        ).reshape((1, nr_chan, 1))
-        sd = np.sqrt(var + bn.eps)
-
-        data.set_value(xv)
-        yv = bn(data)
-        yv_expect = (xv - mean) / sd
-
-        assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6)
-
-
-@pytest.mark.skipif(
-    platform.system() == "Darwin", reason="do not imp GPU mode 
at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -def test_syncbn_no_stats(): - nr_chan = 8 - data_shape = (3, nr_chan, 4) - bn = SyncBatchNorm(8, track_running_stats=False) - data = tensor() - for i in range(4): - if i == 2: - bn.training = False - xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) - mean = np.mean(np.mean(xv, axis=0, keepdims=True), axis=2, keepdims=True) - var = np.var( - np.transpose(xv, [0, 2, 1]).reshape( - (data_shape[0] * data_shape[2], nr_chan) - ), - axis=0, - ).reshape((1, nr_chan, 1)) - sd = np.sqrt(var + bn.eps) - - data.set_value(xv) - yv = bn(data) - yv_expect = (xv - mean) / sd - - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) - - -def test_batchnorm2d_no_stats(): - nr_chan = 8 - data_shape = (3, nr_chan, 16, 16) - bn = BatchNorm2d(8, track_running_stats=False) - data = tensor() - for i in range(4): - if i == 2: - bn.training = False - xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) - xv_transposed = np.transpose(xv, [0, 2, 3, 1]).reshape( - (data_shape[0] * data_shape[2] * data_shape[3], nr_chan) - ) - - mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1) - var = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1)) - sd = np.sqrt(var + bn.eps) - - data.set_value(xv) - yv = bn(data) - yv_expect = (xv - mean) / sd - - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) - - -@pytest.mark.skipif( - platform.system() == "Darwin", reason="do not imp GPU mode at macos now" -) -@pytest.mark.skipif( - platform.system() == "Windows", reason="do not imp GPU mode at Windows now" -) -def test_syncbn2d_no_stats(): - nr_chan = 8 - data_shape = (3, nr_chan, 16, 16) - bn = SyncBatchNorm(8, track_running_stats=False) - data = tensor() - for i in range(4): - if i == 2: - bn.training = False - xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) - xv_transposed = np.transpose(xv, [0, 2, 3, 1]).reshape( - (data_shape[0] * data_shape[2] * data_shape[3], nr_chan) - ) - - mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1) - var = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1)) - sd = np.sqrt(var + bn.eps) - - data.set_value(xv) - yv = bn(data) - yv_expect = (xv - mean) / sd - - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) diff --git a/python_module/test/unit/module/test_conv.py b/python_module/test/unit/module/test_conv.py deleted file mode 100644 index f67a8aaf..00000000 --- a/python_module/test/unit/module/test_conv.py +++ /dev/null @@ -1,110 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import itertools - -import numpy as np - -from megengine import Parameter, tensor -from megengine.module import ConvTranspose2d, LocalConv2d -from megengine.test import assertTensorClose - - -def test_conv_transpose2d(): - SH, SW = 3, 1 - PH, PW = 2, 0 - N, IC, IH, IW = 4, 5, 8, 6 - KH, KW = 3, 4 - OC = 3 - BIAS = True - - def getsize(inp, kern, stride): - return (inp - 1) * stride + kern - - OH = getsize(IH, KH, SH) - OW = getsize(IW, KW, SW) - - inp = np.random.normal(size=(N, IC, IH, IW)).astype(np.float32) - out = np.zeros((N, OC, OH, OW), dtype=np.float32) - weight = np.random.normal(size=(IC, OC, KH, KW)).astype(np.float32) - bias = np.random.normal(size=(1, OC, 1, 1)).astype(np.float32) - - # naive calculation use numpy - for n, ic, ih, iw in itertools.product(*map(range, [N, IC, IH, IW])): - oh, ow = ih * SH, iw * SW - out[n, :, oh : oh + KH, ow : ow + KW] += inp[n, ic, ih, iw] * weight[ic] - out = out[:, :, PH : OH - PH, PW : OW - PW] - if BIAS: - out += bias - - # megengine conv_transpose2d calculation - conv_transpose2d = ConvTranspose2d(IC, OC, (KH, KW), (SH, SW), (PH, PW), bias=BIAS) - conv_transpose2d.weight = Parameter(weight, dtype=np.float32) - if BIAS: - conv_transpose2d.bias = Parameter(bias, dtype=np.float32) - y = conv_transpose2d(tensor(inp)) - - assertTensorClose(out, y.numpy(), max_err=2e-6) - - -def test_local_conv2d(): - batch_size = 10 - in_channels = 4 - out_channels = 8 - input_height = 8 - input_width = 8 - kernel_size = 3 - stride = 1 - padding = 1 - dilation = 1 - groups = 1 - local_conv2d = LocalConv2d( - in_channels=in_channels, - out_channels=out_channels, - input_height=input_height, - input_width=input_width, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - ) - inputs = np.random.normal( - size=(batch_size, in_channels, input_height, input_width) - ).astype(np.float32) - output_height = (input_height + padding * 2 - kernel_size) // stride + 1 - output_width = (input_width + padding * 2 - kernel_size) // stride + 1 - weights = np.random.normal( - size=( - groups, - output_height, - output_width, - in_channels // groups, - kernel_size, - kernel_size, - out_channels // groups, - ) - ).astype(np.float32) - local_conv2d.weight = Parameter(weights) - outputs = local_conv2d(tensor(inputs)) - # naive calculation use numpy - # only test output_height == input_height, output_width == input_width, group == 1 - inputs = np.pad(inputs, ((0, 0), (0, 0), (1, 1), (1, 1))) - expected = np.zeros( - (batch_size, out_channels, output_height, output_width), dtype=np.float32, - ) - for n, oc, oh, ow in itertools.product( - *map(range, [batch_size, out_channels, output_height, output_width]) - ): - ih, iw = oh * stride, ow * stride - expected[n, oc, ih, iw] = np.sum( - inputs[n, :, ih : ih + kernel_size, iw : iw + kernel_size] - * weights[0, oh, ow, :, :, :, oc] - ) - - assertTensorClose(outputs.numpy(), expected, max_err=1e-5) diff --git a/python_module/test/unit/module/test_external.py b/python_module/test/unit/module/test_external.py deleted file mode 100644 index 44f5cf21..00000000 --- a/python_module/test/unit/module/test_external.py +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
-# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import os - -import numpy as np - -import megengine as mge -from megengine import tensor -from megengine.module import Module -from megengine.module.external import AtlasSubgraph, CambriconSubgraph - - -class CambriconModule(Module): - def __init__(self, data): - super().__init__() - self.cambricon = CambriconSubgraph(data, "subnet0", True) - - def forward(self, inputs): - out = self.cambricon(inputs) - return out - - -def test_cambricon_module(): - model = "CambriconRuntimeOprTest.MutableBatchSize.mlu" - model = os.path.join(os.path.dirname(__file__), model) - with open(model, "rb") as f: - data = f.read() - m = CambriconModule(data) - inputs = [] - inputs.append(tensor(dtype=np.float16, device="cambricon0")) - inputs[0].set_value(np.random.normal(size=(1, 64, 32, 32)).astype(np.float16)) - - def inference(inps): - pred = m(inps) - return pred - - pred = inference(inputs) - - -class AtlasModule(Module): - def __init__(self, data): - super().__init__() - self.atlas = AtlasSubgraph(data) - - def forward(self, inputs): - out = self.atlas(inputs) - return out - - -def test_atlas_module(): - model = "AtlasRuntimeOprTest.basic.om" - model = os.path.join(os.path.dirname(__file__), model) - with open(model, "rb") as f: - data = f.read() - m = AtlasModule(data) - inputs = [] - inputs.append(tensor(dtype=np.float32, device="atlas0")) - inputs[0].set_value(np.random.normal(size=(4, 3, 16, 16)).astype(np.float32)) - - def inference(inps): - pred = m(inps) - return pred - - pred = inference(inputs) diff --git a/python_module/test/unit/module/test_init.py b/python_module/test/unit/module/test_init.py deleted file mode 100644 index 06bc4339..00000000 --- a/python_module/test/unit/module/test_init.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import pytest - -from megengine.module import Conv2d, Linear -from megengine.module.init import calculate_fan_in_and_fan_out - - -def test_calculate_fan_in_and_fan_out(): - l = Linear(in_features=3, out_features=8) - fanin, fanout = calculate_fan_in_and_fan_out(l.weight) - assert fanin == 3 - assert fanout == 8 - - with pytest.raises(ValueError): - calculate_fan_in_and_fan_out(l.bias) - - l = Conv2d(in_channels=2, out_channels=3, kernel_size=(5, 7)) - fanin, fanout = calculate_fan_in_and_fan_out(l.weight) - assert fanin == 2 * 5 * 7 - assert fanout == 3 * 5 * 7 diff --git a/python_module/test/unit/module/test_module.py b/python_module/test/unit/module/test_module.py deleted file mode 100644 index 0766f6ee..00000000 --- a/python_module/test/unit/module/test_module.py +++ /dev/null @@ -1,464 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import os -import tempfile -from collections import OrderedDict -from io import BytesIO - -import numpy as np -import pytest -from helpers import MLP - -import megengine as mge -import megengine._internal as mgb -import megengine.functional as F -from megengine.core import Buffer, Parameter, Tensor, tensor -from megengine.module import ( - BatchNorm1d, - BatchNorm2d, - Conv2d, - Linear, - Module, - Sequential, -) -from megengine.quantization.quantize import quantize, quantize_qat -from megengine.test import assertTensorClose - - -class MyModule(Module): - class InnerModule(Module): - def __init__(self): - super().__init__() - self.bn = BatchNorm2d(4) - - def forward(self, x): - return self.bn(x) - - def __init__(self): - super().__init__() - self.i = self.InnerModule() - self.bn = BatchNorm2d(4) - self.param = Parameter(np.ones(1, dtype=np.float32)) - self.buff = Buffer(np.ones(1, dtype=np.float32)) - - def forward(self, x): - x = self.i(x) - x = self.bn(x) - return x - - -def test_module_api(): - m = MyModule() - assert list(m.children()) == [m.bn, m.i] - assert list(m.named_children()) == [("bn", m.bn), ("i", m.i)] - assert list(m.modules()) == [m, m.bn, m.i, m.i.bn] - assert list(m.named_modules()) == [ - ("", m), - ("bn", m.bn), - ("i", m.i), - ("i.bn", m.i.bn), - ] - assert list(m.named_modules(prefix="x")) == [ - ("x", m), - ("x.bn", m.bn), - ("x.i", m.i), - ("x.i.bn", m.i.bn), - ] - assert list(m.buffers()) == [ - m.bn.running_mean, - m.bn.running_var, - m.buff, - m.i.bn.running_mean, - m.i.bn.running_var, - ] - assert list(m.buffers(recursive=False)) == [m.buff] - assert list(m.named_buffers()) == [ - ("bn.running_mean", m.bn.running_mean), - ("bn.running_var", m.bn.running_var), - ("buff", m.buff), - ("i.bn.running_mean", m.i.bn.running_mean), - ("i.bn.running_var", m.i.bn.running_var), - ] - assert list(m.parameters()) == [ - m.bn.bias, - m.bn.weight, - m.i.bn.bias, - m.i.bn.weight, - m.param, - ] - assert list(m.named_parameters()) == [ - ("bn.bias", m.bn.bias), - ("bn.weight", m.bn.weight), - ("i.bn.bias", m.i.bn.bias), - ("i.bn.weight", m.i.bn.weight), - ("param", m.param), - ] - m.eval() - assert ( - m.training == False - and m.bn.training == False - and m.i.training == False - and m.i.bn.training == False - ) - m.bn.train() - assert m.training == False and m.bn.training == True and m.i.bn.training == False - m.eval() - m.i.train() - assert ( - m.training == False - and m.bn.training == False - and m.i.training == True - and m.i.bn.training == True - ) - m.eval() - m.train() - assert m.training == True and m.bn.training == True and m.i.bn.training == True - - def fn(m): - m.training = False - - m.apply(fn) - assert m.bn.training == False and m.i.bn.training == False - - -def test_module_api_reuse_submodule(): - m = MyModule() - m.h = m.i # pylint: disable=attribute-defined-outside-init - assert list(m.modules()) == [m, m.bn, m.i, m.i.bn] - assert list(m.named_modules()) == [ - ("", m), - ("bn", m.bn), - ("h", m.i), - ("h.bn", m.i.bn), - ] - - -def test_module_api_iterable_stability(): - m = MyModule() - l = list(m.modules()) - for _ in range(100): - assert list(m.modules()) == l - - -def test_module_api_hooks(): - net = MyModule() - pre_hook_num = 0 - post_hook_num = 0 - hooks = [] - - def pre_hook(module, inputs): - nonlocal pre_hook_num - pre_hook_num += 1 - modified_inputs = tuple(inp + 1 for inp in inputs) - return modified_inputs - - def post_hook(module, inputs, outputs): - nonlocal post_hook_num - post_hook_num += 1 - outputs += 1 - return outputs - - net.apply(lambda 
module: hooks.append(module.register_forward_pre_hook(pre_hook))) - net.apply(lambda module: hooks.append(module.register_forward_hook(post_hook))) - - shape = (1, 4, 1, 1) - x = tensor(np.zeros(shape, dtype=np.float32)) - y = net(x) - - assert pre_hook_num == 4 - assert post_hook_num == 4 - mean1 = Parameter(np.zeros(shape), dtype=np.float32) - bn1 = F.batch_norm2d( - x + 3, mean1, Parameter(np.ones(shape), dtype=np.float32), training=True - ) - assertTensorClose( - net.i.bn.running_mean, mean1, - ) - mean2 = Parameter(np.zeros(shape), dtype=np.float32) - bn2 = F.batch_norm2d( - bn1 + 3, mean2, Parameter(np.ones(shape), dtype=np.float32), training=True - ) - assertTensorClose( - net.bn.running_mean, mean2, - ) - assertTensorClose(bn2 + 2, y) - - assert len(hooks) == 8 - for handler in hooks: - handler.remove() - y = net(x) - assert pre_hook_num == 4 - assert post_hook_num == 4 - - -class MyModule2(Module): - class InnerModule(Module): - def __init__(self): - super().__init__() - self.bn = BatchNorm2d(4) - self.test_bool_key = {True: 1, False: 0} - - def forward(self, x): - x = self.bn(x) - - def __init__(self): - super().__init__() - self.bn = BatchNorm2d(4) - self.a = [ - BatchNorm2d(4), - {"x": BatchNorm2d(4), "y": [BatchNorm2d(4), self.InnerModule()], "z": 0}, - (self.InnerModule(),), - ] - - def forward(self, x): - return x - - -def test_expand_structure(): - m = MyModule2() - assert list(m.named_modules()) == [ - ("", m), - ("a.0", m.a[0]), - ("a.1.x", m.a[1]["x"]), - ("a.1.y.0", m.a[1]["y"][0]), - ("a.1.y.1", m.a[1]["y"][1]), - ("a.1.y.1.bn", m.a[1]["y"][1].bn), - ("a.2.0", m.a[2][0]), - ("a.2.0.bn", m.a[2][0].bn), - ("bn", m.bn), - ] - - -def test_flatten_others(): - def be_others(obj): - return not isinstance(obj, (Tensor, Module)) - - m = MyModule2() - assert len(list(m._flatten(with_key=True, predicate=be_others))) == 0 - - -def test_flatten_with_parent(): - m = MyModule2() - assert list(m.named_modules(with_parent=True)) == [ - ("", m, None), - ("a.0", m.a[0], m), - ("a.1.x", m.a[1]["x"], m), - ("a.1.y.0", m.a[1]["y"][0], m), - ("a.1.y.1", m.a[1]["y"][1], m), - ("a.1.y.1.bn", m.a[1]["y"][1].bn, m.a[1]["y"][1]), - ("a.2.0", m.a[2][0], m), - ("a.2.0.bn", m.a[2][0].bn, m.a[2][0]), - ("bn", m.bn, m), - ] - assert list(m.modules(with_parent=True)) == [ - (m, None), - (m.a[0], m), - (m.a[1]["x"], m), - (m.a[1]["y"][0], m), - (m.a[1]["y"][1], m), - (m.a[1]["y"][1].bn, m.a[1]["y"][1]), - (m.a[2][0], m), - (m.a[2][0].bn, m.a[2][0]), - (m.bn, m), - ] - - -class MyModule3(Module): - class InnerModule(Module): - def __init__(self): - super().__init__() - self.bn = BatchNorm2d(4) - - def forward(self, x): - x = self.bn(x) - - def __init__(self): - super().__init__() - self.bn = BatchNorm2d(4) - self.seq = Sequential(BatchNorm2d(4), self.InnerModule(),) - - def forward(self, x): - return x - - -def test_module_api_with_sequential(): - m = MyModule3() - assert list(m.named_modules()) == [ - ("", m), - ("bn", m.bn), - ("seq", m.seq), - ("seq.0", m.seq[0]), - ("seq.1", m.seq[1]), - ("seq.1.bn", m.seq[1].bn), - ] - - -def test_sequential_named_children(): - modules = OrderedDict() - modules["name0"] = Linear(20, 10) - modules["name1"] = Linear(10, 5) - modules["name2"] = Linear(5, 1) - m = Sequential(modules) - l = list(m.named_children()) - assert l[0][0] == "name0" - assert l[1][0] == "name1" - assert l[2][0] == "name2" - - -def test_state_dict(): - data_shape = (2, 28) - data = tensor() - data.set_value(np.random.random(data_shape)) - mlp = MLP() - pred0 = mlp(data) - - with BytesIO() as 
fout: - mge.save(mlp.state_dict(), fout) - fout.seek(0) - state_dict = mge.load(fout) - state_dict["extra"] = None - mlp1 = MLP() - mlp1.load_state_dict(state_dict, strict=False) - pred1 = mlp1(data) - assertTensorClose(pred0.numpy(), pred1.numpy(), max_err=5e-6) - with pytest.raises(KeyError): - mlp1.load_state_dict(state_dict) - del state_dict["extra"] - del state_dict["dense0.bias"] - with pytest.raises(KeyError): - mlp1.load_state_dict(state_dict) - - -class AssertModule(Module): - def __init__(self): - super().__init__() - self.error_tensor_key = {True: tensor(), False: 0} - - def forward(self, x): - return x - - -def test_assert_message(): - m = AssertModule() - with pytest.raises( - AssertionError, match="keys for Tensor and Module must be str, error key: True" - ): - list(m._flatten()) - - -class Simple(Module): - def __init__(self): - super().__init__() - self.conv0 = Conv2d(1, 1, kernel_size=3, bias=False) - self.conv1 = Conv2d(1, 1, kernel_size=3, bias=False) - self.conv1.weight = self.conv0.weight - - def forward(self, inputs): - pass - - -def test_shared_param(): - net = Simple() - assert net.conv0.weight is net.conv1.weight - data = tensor(np.random.random((1, 1, 8, 8)).astype(np.float32)) - assertTensorClose(net.conv0(data).numpy(), net.conv1(data).numpy()) - with BytesIO() as f: - mge.save(net, f) - f.seek(0) - net1 = mge.load(f) - assert net1.conv0.weight is net1.conv1.weight - assertTensorClose(net1.conv0(data).numpy(), net1.conv1(data).numpy()) - - with BytesIO() as f: - mge.save(net.conv0, f) - f.seek(0) - conv0 = mge.load(f) - - with BytesIO() as f: - mge.save(net.conv1, f) - f.seek(0) - conv1 = mge.load(f) - - assert conv0.weight is not conv1.weight - assertTensorClose(conv0(data).numpy(), conv1(data).numpy()) - - -def test_pickle_module(): - data_shape = (2, 28) - data = tensor() - data.set_value(np.random.random(data_shape)) - mlp = MLP() - # pickle before forward - with BytesIO() as fout: - mge.save(mlp, fout) - fout.seek(0) - mlp1 = mge.load(fout) - pred0 = mlp1(data) - - pred1 = mlp(data) - - # pickle after forward - with BytesIO() as fout: - mge.save(mlp, fout) - fout.seek(0) - mlp1 = mge.load(fout) - pred2 = mlp1(data) - - assertTensorClose(pred0.numpy(), pred1.numpy(), max_err=5e-6) - assertTensorClose(pred0.numpy(), pred2.numpy(), max_err=5e-6) - - -def test_dump_model(): - data_shape = (2, 28) - data = tensor() - data.set_value(np.random.random(data_shape)) - mlp = MLP() - pred = mlp(data) - f = tempfile.NamedTemporaryFile(delete=False) - f_name = f.name - try: - mge.dump(pred, f_name) - finally: - f.close() - os.unlink(f_name) - - -def test_load_quantized(): - data_shape = (2, 28) - data = tensor(np.random.random(data_shape), dtype="float32") - data = data.astype(mgb.dtype.qint8(0.1)) - mlp = MLP() - quantize_qat(mlp) - quantize(mlp) - mlp.dense0.weight = Parameter( - mlp.dense0.weight.astype(mgb.dtype.qint8(0.001)).numpy() - ) - mlp.dense1.weight = Parameter( - mlp.dense1.weight.astype(mgb.dtype.qint8(0.0002)).numpy() - ) - mlp.eval() - pred0 = mlp(data) - - with BytesIO() as fout: - mge.save(mlp.state_dict(), fout) - fout.seek(0) - checkpoint = mge.load(fout) - # change mlp weight. 
- mlp.dense0.weight = Parameter( - mlp.dense0.weight.astype(mgb.dtype.qint8(0.00001)).numpy() - ) - mlp.dense1.weight = Parameter( - mlp.dense1.weight.astype(mgb.dtype.qint8(0.2)).numpy() - ) - mlp.load_state_dict(checkpoint) - pred1 = mlp(data) - - assertTensorClose( - pred0.astype("float32").numpy(), pred1.astype("float32").numpy(), max_err=5e-6 - ) diff --git a/python_module/test/unit/module/test_pytorch.py b/python_module/test/unit/module/test_pytorch.py deleted file mode 100644 index d7b3ae9a..00000000 --- a/python_module/test/unit/module/test_pytorch.py +++ /dev/null @@ -1,140 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np -import torch -from helpers import randomTorch - -import megengine as mge -import megengine._internal as mgb -import megengine.functional -import megengine.optimizer as optimizer -from megengine import get_default_device, set_default_device -from megengine.core import Parameter, tensor -from megengine.jit import trace -from megengine.module import Module as MGEModule -from megengine.module.pytorch import PyTorchModule -from megengine.test import assertTensorClose - - -def test_pytorch_forward(): - class APlusB(torch.nn.Module): - def __init__(self): - super(APlusB, self).__init__() - - def forward(self, a, b): - return a + b - - a = randomTorch(15, 15) - b = randomTorch(15, 15) - - def get_pytorch_forward(): - return APlusB()(a, b) - - def get_mge_forward(): - mge_module = PyTorchModule(APlusB()) - mge_a = tensor(a.numpy(), dtype=np.float32) - mge_b = tensor(b.numpy(), dtype=np.float32) - return mge_module(mge_a, mge_b) - - assertTensorClose(get_pytorch_forward().numpy(), get_mge_forward().numpy()) - - -def test_pytorch_backward(): - class APlusB(torch.nn.Module): - def __init__(self): - super(APlusB, self).__init__() - - def forward(self, a, b): - return a + b - - a = randomTorch(15, 15) - b = randomTorch(15, 15) - - def get_pytorch_backward(): - parameter_a = a.clone() - parameter_a.requires_grad = True - c = APlusB()(parameter_a, b) - d = APlusB()(c, b) - e = torch.sum(d) - e.backward() - return parameter_a.grad - - def get_mge_backward(): - mge_module = PyTorchModule(APlusB()) - mge_a = Parameter(a.numpy(), dtype=np.float32) - mge_b = tensor(b.numpy(), dtype=np.float32) - mge_c = mge_module(mge_a, mge_b) - mge_d = mge_module(mge_c, mge_b) - mge_e = mge.functional.sum(mge_d) - return mge.functional.grad(mge_e, mge_a, use_virtual_grad=False) - - assertTensorClose(get_pytorch_backward().numpy(), get_mge_backward().numpy()) - - -def test_pytorch_mixed(): - - init_param = (np.array([2.0], dtype=np.float32), np.array([3.0], dtype=np.float32)) - lr = 1.0 - - class Mixed(MGEModule): - class SubModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.multiplier = torch.nn.Parameter(torch.tensor(init_param[0])) - - def forward(self, inp): - return inp * self.multiplier - - def __init__(self): - super().__init__() - self.torch_module = PyTorchModule(self.SubModule()) - self.multiplier = Parameter(init_param[1], dtype=np.float32) - - def forward(self, inp): - return self.torch_module(inp) * self.multiplier - - def run(step, enable_trace, use_symbolic): - def train_func(data, net=None, opt=None): 
- pred = net(data) - opt.backward(pred) - return pred - - if enable_trace: - train_func = trace(train_func, symbolic=use_symbolic) - - net = Mixed() - data = tensor() - opt = optimizer.SGD(net.parameters(), lr=lr) - - saved_param = init_param - for i in range(step): - opt.zero_grad() - data.set_value([i + 1.0]) - output = train_func(data, net=net, opt=opt) - opt.step() - - expect_param = ( - saved_param[0] - lr * saved_param[1] * data.numpy(), - saved_param[1] - lr * saved_param[0] * data.numpy(), - ) - assertTensorClose( - output.numpy(), saved_param[0] * saved_param[1] * data.numpy() - ) - torch_param = net.torch_module._torch_params[0].detach().cpu() - assertTensorClose(torch_param.numpy(), expect_param[0]) - assertTensorClose(net.multiplier.numpy(), expect_param[1]) - saved_param = expect_param - - run(1, False, False) - run(1, True, True) - run(1, True, False) - - run(2, False, False) - run(2, True, True) - run(2, True, False) diff --git a/python_module/test/unit/module/test_qat.py b/python_module/test/unit/module/test_qat.py deleted file mode 100644 index 6b6c5a86..00000000 --- a/python_module/test/unit/module/test_qat.py +++ /dev/null @@ -1,85 +0,0 @@ -from itertools import product - -import numpy as np - -from megengine import tensor -from megengine.module import ( - Conv2d, - ConvBn2d, - ConvRelu2d, - DequantStub, - Module, - QuantStub, -) -from megengine.quantization.quantize import disable_fake_quant, quantize_qat -from megengine.test import assertTensorClose - - -def test_qat_convbn2d(): - in_channels = 32 - out_channels = 64 - kernel_size = 3 - for groups, bias in product([1, 4], [True, False]): - module = ConvBn2d( - in_channels, out_channels, kernel_size, groups=groups, bias=bias - ) - module.train() - qat_module = quantize_qat(module, inplace=False) - disable_fake_quant(qat_module) - inputs = tensor(np.random.randn(4, in_channels, 32, 32).astype(np.float32)) - normal_outputs = module(inputs) - qat_outputs = qat_module(inputs) - assertTensorClose(normal_outputs, qat_outputs, max_err=5e-6) - assertTensorClose( - module.bn.running_mean, qat_module.bn.running_mean, max_err=5e-8 - ) - assertTensorClose( - module.bn.running_var, qat_module.bn.running_var, max_err=5e-7 - ) - module.eval() - normal_outputs = module(inputs) - qat_module.eval() - qat_outputs = qat_module(inputs) - assertTensorClose(normal_outputs, qat_outputs, max_err=5e-6) - - -def test_qat_conv(): - - in_channels = 32 - out_channels = 64 - kernel_size = 3 - - class TestNet(Module): - def __init__(self, groups, bias): - super().__init__() - self.quant = QuantStub() - self.dequant = DequantStub() - self.conv = Conv2d( - in_channels, out_channels, kernel_size, groups=groups, bias=bias - ) - self.conv_relu = ConvRelu2d( - out_channels, in_channels, kernel_size, groups=groups, bias=bias - ) - - def forward(self, inp): - out = self.quant(inp) - out = self.conv(out) - out = self.conv_relu(out) - out = self.dequant(out) - return out - - inputs = tensor(np.random.randn(4, in_channels, 32, 32).astype(np.float32)) - for groups, bias in product([1, 4], [True, False]): - net = TestNet(groups, bias) - net.train() - qat_net = quantize_qat(net, inplace=False) - disable_fake_quant(qat_net) - normal_outputs = net(inputs) - qat_outputs = qat_net(inputs) - assertTensorClose(normal_outputs, qat_outputs) - - net.eval() - normal_outputs = net(inputs) - qat_net.eval() - qat_outputs = qat_net(inputs) - assertTensorClose(normal_outputs, qat_outputs) diff --git a/python_module/test/unit/module/test_tensor.py 
b/python_module/test/unit/module/test_tensor.py deleted file mode 100644 index 86c5726e..00000000 --- a/python_module/test/unit/module/test_tensor.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import copy - -import numpy as np -import pytest - -import megengine as mge -import megengine.functional as F -from megengine.core import Buffer, Graph, Parameter -from megengine.module import Conv2d -from megengine.test import assertTensorClose - - -def test_set_value(): - v0 = np.random.random((2, 3)).astype(np.float32) - param = Parameter(v0) - v1 = np.random.random((2, 3)).astype(np.float32) - param.set_value(v1) - assertTensorClose(param.numpy(), v1, max_err=5e-6) - v2 = np.random.random((3, 3)).astype(np.float32) - # TODO: add this - # with pytest.raises(ValueError): - # param.set_value(v2) - assertTensorClose(param.numpy(), v1, max_err=5e-6) - - -def test_fill(): - a = Buffer(np.zeros((2, 3), dtype=np.float32)) - a.fill(3) - assertTensorClose(a.numpy(), np.full((2, 3), 3, dtype=np.float32)) - a.fill(124.568) - assertTensorClose(a.numpy(), np.full((2, 3), 124.568, dtype=np.float32)) - - -# TODO: remove or rewrite following test -# def test_attach(): -# p_ = np.random.random((2, 3)).astype(np.float32) - -# with Graph() as g: -# g.set_option('eager_evaluation', False) -# p = Parameter(p_) -# v = p * 2 -# f = compile(v, None) - -# out, = f() -# assertTensorClose(out, p_ * 2) - -# F.add_update(p, p) -# out, = f() -# assertTensorClose(out, p_ * 4) - -# TODO: remove or rewrite following test -# def test_module_attach(): -# v = np.random.random((1, 3, 64, 64)).astype(np.float32) -# net = Conv2d(3, 16, 3) - -# with Graph() as g: -# g.set_option('eager_evaluation', False) - -# data0 = Input("data") -# f = compile(net(data0), None) - -# out0, = f(data=v) - -# data1 = Input("data", value=v) -# out1 = net(data1) - -# assertTensorClose(out0, out1.numpy()) - - -def test_shape_warning(): - with Graph() as cg: - cg.set_option("eager_evaluation", False) - b = Buffer(np.ones((2, 3)).astype(np.float32)) - with pytest.warns(None) as record: - print(b.shape) - if len(record) != 0: - raise ValueError( - "Getting the shape of a constant Tensor should throw no Warning" - ) diff --git a/python_module/test/unit/optimizer/__init__.py b/python_module/test/unit/optimizer/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python_module/test/unit/optimizer/test_lr_scheduler.py b/python_module/test/unit/optimizer/test_lr_scheduler.py deleted file mode 100644 index e185f179..00000000 --- a/python_module/test/unit/optimizer/test_lr_scheduler.py +++ /dev/null @@ -1,23 +0,0 @@ -from bisect import bisect_right - -import numpy as np -from helpers import MLP - -from megengine.optimizer import SGD, MultiStepLR -from megengine.test import assertTensorClose - - -def test_multi_step_lr(): - mlp = MLP() - opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9) - scheduler = MultiStepLR(opt, [3, 6, 8]) - - lr = np.array(0.01, dtype=np.float32) - for i in range(10): - for group in opt.param_groups: - assertTensorClose( - np.array(group["lr"], dtype=np.float32), - (lr * 0.1 ** bisect_right([3, 6, 8], i)).astype(np.float32), - max_err=5e-6, - ) - 
scheduler.step() diff --git a/python_module/test/unit/optimizer/test_optimizer.py b/python_module/test/unit/optimizer/test_optimizer.py deleted file mode 100644 index e172df79..00000000 --- a/python_module/test/unit/optimizer/test_optimizer.py +++ /dev/null @@ -1,258 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from io import BytesIO - -import numpy as np -from helpers import MLP, graph_mode - -import megengine.functional as F -from megengine import load, optimizer, save -from megengine.core import TensorDict, tensor -from megengine.jit import trace -from megengine.test import assertTensorClose - - -def get_input(): - batch_size, input_dim = 2, 28 - data_shape, label_shape = (batch_size, input_dim), (batch_size,) - data, label = tensor(dtype=np.float32), tensor(dtype=np.int32) - data.set_value(np.random.random(data_shape).astype(np.float32)) - label.set_value(np.random.randint(0, 10, label_shape)) - return data, data_shape, label, label_shape - - -@graph_mode("eager", "static") -def test_optimizer_serialization(): - data, data_shape, label, label_shape = get_input() - mlp = MLP() - opt = optimizer.SGD(mlp.parameters(), lr=0.01, momentum=0.9) - slots = TensorDict() - for param in mlp.parameters(): - slots[param] = np.zeros(param.shape).astype(np.float32) - - pred = mlp(data) - loss = F.square_loss(pred, label.reshape(-1, 1)) - opt.zero_grad() - opt.backward(loss) - opt.step() - for param in mlp.parameters(): - slots[param] = slots[param] * 0.9 + param.grad.numpy() - - with BytesIO() as fout: - save(opt.state_dict(), fout) - fout.seek(0) - state_dict = load(fout) - opt1 = optimizer.SGD(mlp.parameters(), lr=0.02, momentum=0.8) - opt1.load_state_dict(state_dict) - - data.set_value(np.random.random(data_shape).astype(np.float32)) - label.set_value(np.random.randint(0, 10, label_shape)) - pred = mlp(data) - loss = F.square_loss(pred, label.reshape(-1, 1)) - opt1.zero_grad() - opt1.backward(loss) - orig_params = TensorDict() - for param in mlp.parameters(): - orig_params[param] = np.copy(param.numpy()) - opt1.step() - for param in mlp.parameters(): - orig_param = orig_params[param] - slots[param] = slots[param] * 0.9 + param.grad.numpy() - assertTensorClose(param.numpy(), orig_param - 0.01 * slots[param]) - - -def _test_optimizer(opt_str, test_case, check_class, update_lr=False): - iter_num = 3 - data, data_shape, label, label_shape = get_input() - - net = MLP() - opt = getattr(optimizer, opt_str)(net.parameters(), **test_case) - check_func = check_class(net, **test_case) - - step = 0 - - # eager graph - for i in range(iter_num): - if update_lr and i == 1: # change learning rate - for group in opt.param_groups: - group["lr"] += 0.01 - check_func.lr += 0.01 - data.set_value(np.random.random(data_shape).astype(np.float32)) - label.set_value(np.random.randint(0, 10, label_shape)) - pred = net(data) - loss = F.square_loss(pred, label.reshape(-1, 1)) - opt.zero_grad() - opt.backward(loss) - ori_params = TensorDict() - for param in net.parameters(): - ori_params[param] = np.copy(param.numpy()) - opt.step() - step += 1 - check_func(ori_params, net.parameters(), step) - - # static graph - @trace - def train_func(data, label): - pred = net(data) - loss = 
F.square_loss(pred, label.reshape(-1, 1)) - opt.backward(loss) - - for i in range(iter_num): - if update_lr and i == 1: # change learning rate - for group in opt.param_groups: - group["lr"] += 0.01 - check_func.lr += 0.01 - opt.zero_grad() - ori_params = TensorDict() - for param in net.parameters(): - ori_params[param] = np.copy(param.numpy()) - train_func( - np.random.random(data_shape).astype(np.float32), - np.random.randint(0, 10, label_shape).astype(np.int32), - ) - opt.step() - step += 1 - check_func(ori_params, net.parameters(), step) - - -def test_sgd(): - class CheckValue: - def __init__(self, net, **kwarg): - self.slots = TensorDict() - for param in net.parameters(): - self.slots[param] = np.zeros(param.shape).astype(np.float32) - for k, v in kwarg.items(): - setattr(self, k, v) - - def __call__(self, ori_params, new_params, step): - for param in new_params: - grad = param.grad.numpy() - if hasattr(self, "momentum"): - self.slots[param] = grad + self.slots[param] * self.momentum - delta = -self.lr * self.slots[param] - else: - delta = -self.lr * grad - assertTensorClose(param.numpy(), ori_params[param] + delta) - - cases = [ - {"momentum": 0.9, "lr": 0.01}, # SGD with momentum - {"lr": 0.01}, # simple SGD - {"weight_decay": 0.1, "lr": 0.01}, # with weight_decay - ] - for case in cases: - _test_optimizer("SGD", case, CheckValue) - _test_optimizer("SGD", case, CheckValue, update_lr=True) - - -def test_adam(): - class CheckValue: - def __init__(self, net, **kwarg): - self.m_slots = TensorDict() - self.v_slots = TensorDict() - for param in net.parameters(): - self.m_slots[param] = np.zeros(param.shape).astype(np.float32) - self.v_slots[param] = np.zeros(param.shape).astype(np.float32) - for k, v in kwarg.items(): - setattr(self, k, v) - - def __call__(self, ori_params, new_params, step): - for param in new_params: - grad = param.grad.numpy() - m = self.m_slots[param] - v = self.v_slots[param] - m *= self.betas[0] - m += (1 - self.betas[0]) * grad - v *= self.betas[1] - v += (1 - self.betas[1]) * grad * grad - delta = (m / (1 - self.betas[0] ** step)) / ( - np.sqrt(v / (1 - self.betas[1] ** step)) + self.eps - ) - assertTensorClose(param.numpy(), ori_params[param] - self.lr * delta) - - cases = [ - {"betas": (0.8, 0.9), "eps": 1e-04, "lr": 0.01}, - { - "betas": (0.8, 0.9), - "eps": 1e-04, - "lr": 0.01, - "weight_decay": 0.1, - }, # with weight_decay - ] - for case in cases: - _test_optimizer("Adam", case, CheckValue) - _test_optimizer("Adam", case, CheckValue, update_lr=True) - - -def test_adagrad(): - class CheckValue: - def __init__(self, net, **kwarg): - self.s_slots = TensorDict() - for param in net.parameters(): - self.s_slots[param] = np.zeros(param.shape).astype(np.float32) - for k, v in kwarg.items(): - setattr(self, k, v) - - def __call__(self, ori_params, new_params, step): - for param in new_params: - grad = param.grad.numpy() - self.s_slots[param] += grad ** 2 - delta = grad / (self.s_slots[param] + self.eps) ** 0.5 - delta *= -(self.lr / (1 + (step - 1) * self.lr_decay)) - assertTensorClose(param.numpy(), ori_params[param] + delta) - - cases = [ - {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01}, - {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.0}, # without lr_decay - { - "lr": 0.01, - "eps": 1e-06, - "lr_decay": 0.01, - "weight_decay": 0.1, - }, # with weight_decay - ] - for case in cases: - _test_optimizer("Adagrad", case, CheckValue) - _test_optimizer("Adagrad", case, CheckValue, update_lr=True) - - -def test_adadelta(): - class CheckValue: - def __init__(self, net, **kwarg): - 
self.s_slots = TensorDict() - self.a_slots = TensorDict() - for param in net.parameters(): - self.s_slots[param] = np.zeros(param.shape).astype(np.float32) - self.a_slots[param] = np.zeros(param.shape).astype(np.float32) - for k, v in kwarg.items(): - setattr(self, k, v) - - def __call__(self, ori_params, new_params, step): - for param in new_params: - grad = param.grad.numpy() - self.s_slots[param] = self.s_slots[param] * self.rho + grad ** 2 * ( - 1 - self.rho - ) - delta = ( - grad - * ((self.a_slots[param] + self.eps) ** 0.5) - / (self.s_slots[param] + self.eps) ** 0.5 - ) - self.a_slots[param] = self.a_slots[param] * self.rho + delta ** 2 * ( - 1 - self.rho - ) - delta *= -self.lr - assertTensorClose(param.numpy(), ori_params[param] + delta) - - cases = [ - {"lr": 1.0, "eps": 1e-06, "rho": 0.9}, - {"lr": 1.0, "eps": 1e-06, "rho": 0.9, "weight_decay": 0.9}, # with weight_decay - ] - for case in cases: - _test_optimizer("Adadelta", case, CheckValue) - _test_optimizer("Adadelta", case, CheckValue, update_lr=True) diff --git a/python_module/test/unit/quantization/quantize.py b/python_module/test/unit/quantization/quantize.py deleted file mode 100644 index 236ef9e1..00000000 --- a/python_module/test/unit/quantization/quantize.py +++ /dev/null @@ -1,80 +0,0 @@ -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from megengine import module as Float -from megengine.module import qat as QAT -from megengine.quantization.quantize import _get_quantable_module_names, quantize_qat - - -def test_get_quantable_module_names(): - # need to make sure names from Quantized and QAT are the same - def _get_qat_module_names(): - def is_qat(key: str): - value = getattr(QAT, key) - return ( - isinstance(value, type) - and issubclass(value, QAT.QATModule) - and value != QAT.QATModule - ) - - # source should have all quantable modules' names - quantable_module_names = [key for key in dir(QAT) if is_qat(key)] - return quantable_module_names - - qat_module_names = _get_qat_module_names() - quantized_module_names = _get_quantable_module_names() - assert set(qat_module_names) == set(quantized_module_names) - - for key in qat_module_names: - value = getattr(Float, key) - assert ( - isinstance(value, type) - and issubclass(value, Float.Module) - and value != Float.Module - ) - - -def test_disable_quantize(): - class Net(Float.Module): - def __init__(self): - super().__init__() - self.conv = Float.ConvBnRelu2d(3, 3, 3) - self.conv.disable_quantize() - - def forward(self, x): - return self.conv(x) - - net = Net() - qat_net = quantize_qat(net, inplace=False) - assert isinstance(qat_net.conv, Float.ConvBnRelu2d) - assert isinstance(qat_net.conv.conv, Float.Conv2d) - - -def test_convert_with_custom_mapping(): - class FloatExample(Float.Module): - def forward(self, x): - return x - - class QATExample(QAT.QATModule): - def forward(self, x): - return x - - @classmethod - def from_float_module(cls, float_module): - return cls() - - class Net(Float.Module): - def __init__(self): - super().__init__() - self.example = FloatExample() - - def forward(self, x): - return self.example(x) - - net = Net() - qat_net = quantize_qat(net, inplace=False, mapping={FloatExample: QATExample}) - assert isinstance(qat_net.example, QATExample) 
diff --git a/python_module/test/unit/quantization/test_fake_quant.py b/python_module/test/unit/quantization/test_fake_quant.py deleted file mode 100644 index 0fbd9eb1..00000000 --- a/python_module/test/unit/quantization/test_fake_quant.py +++ /dev/null @@ -1,80 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np -import pytest - -import megengine as mge -import megengine._internal as mgb -from megengine.core import tensor -from megengine.quantization.fake_quant import TQT_Function -from megengine.quantization.internal_fake_quant import * -from megengine.test import assertTensorClose - - -class numpy_TQT_Function: - def __init__(self, lowerbound, upperbound): - super().__init__() - self.lowerbound = lowerbound - self.upperbound = upperbound - - def forward(self, inp, scale): - t = 2 ** scale - # t = F.maximum(t, 1e-4) - inp_scaled = inp / t - inp_clipped = np.maximum( - np.minimum(inp_scaled, self.upperbound), self.lowerbound - ) - inp_rounded = np.round(inp_clipped) - inp_flq = inp_rounded * t - self.saved_tensors = (inp_scaled, inp_rounded, t) - return inp_flq - - def backward(self, grad_inp_flq): - (inp_scaled, inp_rounded, t) = self.saved_tensors - mask_clip = (inp_scaled < -0.5 + self.lowerbound) + ( - inp_scaled > self.upperbound + 0.5 - ) # mask for accumulating the gradients of |data_scaled|>L - mask_quant = np.abs( - mask_clip - 1 - ) # mask for accumulating the gradients with |data_scaled|<=L - grad_quant = ( - grad_inp_flq * mask_quant * (inp_rounded - inp_scaled) - ) # gradient within |data_scaled|<=L - grad_clip = ( - grad_inp_flq * mask_clip * inp_rounded - ) # gradient with | data_scaled|>L - grad_s = grad_clip.sum() + grad_quant.sum() - # dL/ds = dL/dt * t * ln(2) - grad_s = grad_s * t * np.log(2) - grad_inp = grad_inp_flq * mask_quant - return grad_inp, grad_s - - -def test_TQT(): - f = TQT_Function(-127, 127) - nf = numpy_TQT_Function(-127, 127) - - def check_inp(a, b, c, a_np, b_np, c_np): - assertTensorClose( - f.forward(a, b).numpy(), nf.forward(a_np, b_np).astype("float32") - ) - c1, c2 = f.backward(c) - c1_np, c2_np = nf.backward(c_np) - assertTensorClose(c1.numpy(), c1_np.astype("float32")) - assertTensorClose(c2.numpy(), c2_np.astype("float32")) - - a = tensor() - b = tensor() - a_np = np.random.random((4, 3)).astype("float32") - b_np = np.random.random((1)).astype("float32") - a.set_value(a_np) - b.set_value(b_np) - check_inp(a, b, b, a_np, b_np, b_np) - - diff --git a/python_module/test/unit/random/test_random.py b/python_module/test/unit/random/test_random.py deleted file mode 100644 index 2e8023e8..00000000 --- a/python_module/test/unit/random/test_random.py +++ /dev/null @@ -1,162 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import numpy as np - -import megengine as mge -import megengine.functional as F -import megengine.jit as jit -import megengine.module as M -import megengine.random as R - - -def test_random_static_diff_result(): - @jit.trace(symbolic=True) - def graph_a(): - return R.uniform(5) + R.gaussian(5) - - @jit.trace(symbolic=True) - def graph_b(): - return R.uniform(5) + R.gaussian(5) - - a = graph_a() - b = graph_b() - assert np.any(a.numpy() != b.numpy()) - - -def test_random_static_same_result(): - @jit.trace(symbolic=True) - def graph_a(): - R.manual_seed(731) - return R.uniform(5) + R.gaussian(5) - - @jit.trace(symbolic=True) - def graph_b(): - R.manual_seed(731) - return R.uniform(5) + R.gaussian(5) - - a = graph_a() - b = graph_b() - assert np.all(a.numpy() == b.numpy()) - - -def test_random_dynamic_diff_result(): - a = R.uniform(5) + R.gaussian(5) - b = R.uniform(5) + R.gaussian(5) - assert np.any(a.numpy() != b.numpy()) - - -def test_random_dynamic_same_result(): - R.manual_seed(0) - a = R.uniform(5) + R.gaussian(5) - R.manual_seed(0) - b = R.uniform(5) + R.gaussian(5) - assert np.all(a.numpy() == b.numpy()) - - -def test_range_uniform_static_diff_result(): - @jit.trace(symbolic=True) - def graph_a(): - return R.uniform(5, low=-2, high=2) - - @jit.trace(symbolic=True) - def graph_b(): - return R.uniform(5, low=-2, high=2) - - a = graph_a() - b = graph_b() - assert np.any(a.numpy() != b.numpy()) - - -def test_range_uniform_static_same_result(): - @jit.trace(symbolic=True) - def graph_a(): - R.manual_seed(731) - return R.uniform(5, low=-2, high=2) - - @jit.trace(symbolic=True) - def graph_b(): - R.manual_seed(731) - return R.uniform(5, low=-2, high=2) - - a = graph_a() - b = graph_b() - assert np.all(a.numpy() == b.numpy()) - - -def test_range_uniform_dynamic_diff_result(): - a = R.uniform(5, low=-2, high=2) - b = R.uniform(5, low=-2, high=2) - assert np.any(a.numpy() != b.numpy()) - - -def test_range_uniform_dynamic_same_result(): - R.manual_seed(0) - a = R.uniform(5, low=-2, high=2) - R.manual_seed(0) - b = R.uniform(5, low=-2, high=2) - assert np.all(a.numpy() == b.numpy()) - - -def test_dropout_dynamic_diff_result(): - x = mge.ones(10) - a = F.dropout(x, 0.5) - b = F.dropout(x, 0.5) - assert np.any(a.numpy() != b.numpy()) - - -def test_dropout_dynamic_same_result(): - x = mge.ones(10) - R.manual_seed(0) - a = F.dropout(x, 0.5) - R.manual_seed(0) - b = F.dropout(x, 0.5) - assert np.all(a.numpy() == b.numpy()) - - -def test_M_dropout_static_diff_result(): - m = M.Dropout(0.5) - - @jit.trace(symbolic=True) - def graph_a(x): - return m(x) - - @jit.trace(symbolic=True) - def graph_b(x): - return m(x) - - x = np.ones(10, dtype="float32") - a = graph_a(x) - a = a.numpy().copy() - b = graph_b(x) - c = graph_a(x) - assert np.any(a != b.numpy()) - assert np.any(a != c.numpy()) - - -def test_M_dropout_static_same_result(): - m = M.Dropout(0.5) - - @jit.trace(symbolic=True) - def graph_a(x): - return m(x) - - @jit.trace(symbolic=True) - def graph_b(x): - return m(x) - - x = np.ones(10, dtype="float32") - R.manual_seed(0) - a = graph_a(x) - a = a.numpy().copy() - R.manual_seed(0) - b = graph_b(x) - R.manual_seed(0) # useless - c = graph_a(x) - assert np.all(a == b.numpy()) - assert np.any(a != c.numpy()) diff --git a/scripts/cmake-build/host_build.sh b/scripts/cmake-build/host_build.sh index 8a8f1508..d522c2ca 100755 --- a/scripts/cmake-build/host_build.sh +++ b/scripts/cmake-build/host_build.sh @@ -9,7 +9,6 @@ function usage() { echo "-t : Build with training mode, default inference only" echo "-m 
: Build with m32 mode(only for windows build), default m64"
     echo "-r : remove old build dir before make, default off"
-    echo "-n : enable new python runtime(valid when training mode with -t, default is legacy runtime)"
     echo "-h : show usage"
     echo "append other cmake config by export EXTRA_CMAKE_ARGS=..."
     echo "example: $0 -d"
@@ -23,10 +22,9 @@
 MGE_WINDOWS_BUILD_ARCH=x64
 MGE_WINDOWS_BUILD_MARCH=m64
 MGE_ARCH=x86_64
 REMOVE_OLD_BUILD=false
-MGE_BUILD_IMPERATIVE_RT=OFF
 echo "EXTRA_CMAKE_ARGS: ${EXTRA_CMAKE_ARGS}"
-while getopts "rhdctmn" arg
+while getopts "rhdctm" arg
 do
     case $arg in
         d)
@@ -55,10 +53,6 @@ do
             MGE_WINDOWS_BUILD_MARCH=m32
             MGE_ARCH=i386
             ;;
-        n)
-            echo "Enable imperative python wrapper runtime"
-            MGE_BUILD_IMPERATIVE_RT=ON
-            ;;
         ?)
             echo "unknown argument"
             usage
@@ -107,7 +101,6 @@ function cmake_build() {
     cmake \
         -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
         -DMGE_INFERENCE_ONLY=$MGE_INFERENCE_ONLY \
-        -DMGE_BUILD_IMPERATIVE_RT=${MGE_BUILD_IMPERATIVE_RT} \
         -DMGE_WITH_CUDA=$MGE_WITH_CUDA \
         -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR \
         ${EXTRA_CMAKE_ARGS} \
@@ -244,7 +237,6 @@ function cmake_build_windows() {
     vcvarsall.bat $MGE_WINDOWS_BUILD_ARCH && cmake -G "Ninja" \
         -DMGE_ARCH=$MGE_ARCH \
         -DMGE_INFERENCE_ONLY=$MGE_INFERENCE_ONLY \
-        -DMGE_BUILD_IMPERATIVE_RT=${MGE_BUILD_IMPERATIVE_RT} \
         -DMGE_WITH_CUDA=$MGE_WITH_CUDA \
         -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
         -DCMAKE_INSTALL_PREFIX:PATH=$INSTALL_DIR \
@@ -257,12 +249,6 @@ function cmake_build_windows() {
         ${WINDOWS_BUILD_TARGET}"
 }
 
-if [ ${MGE_BUILD_IMPERATIVE_RT} = "ON" ] && [ ${MGE_INFERENCE_ONLY} = "ON" ]; then
-    echo "ERR: MGE_BUILD_IMPERATIVE_RT(-n) only valid when enable training mode(-t)"
-    echo "pls remove -n or add -t"
-    exit -1
-fi
-
 if [[ $OS =~ "NT" ]]; then
     if [ ${MGE_ARCH} = "i386" ] && [ ${MGE_INFERENCE_ONLY} = "OFF" ]; then
         echo "ERR: training mode(-t) only support 64 bit mode"
diff --git a/scripts/whl/BUILD_PYTHON_WHL_README.md b/scripts/whl/BUILD_PYTHON_WHL_README.md
index 07e55feb..a9510072 100644
--- a/scripts/whl/BUILD_PYTHON_WHL_README.md
+++ b/scripts/whl/BUILD_PYTHON_WHL_README.md
@@ -7,9 +7,8 @@
 ```
     1: enable rootless docker env, refs: https://docs.docker.com/engine/security/rootless/
-    2: cd ./scripts/whl/linux/manylinux2010
-    3: ./build_image.sh cpu
-    4: ./build_image.sh cuda
+    2: cd ./scripts/whl/manylinux2010
+    3: ./build_image.sh
 ```
@@ -60,12 +59,10 @@
 MegBrain delivers the `wheel` package with the `manylinux2010` tag defined in [PEP-571](https://www.python.org/dev/peps/pep-0571/).
 ```
-    ./build_wheel.sh cpu
-
-    CUDA_ROOT_DIR=/path/to/cuda \
-    CUDNN_ROOT_DIR=/path/to/cudnn \
-    TENSORRT_ROOT_DIR=/path/to/tensorrt \
-    ./build_wheel.sh cuda
+    export CUDA_ROOT_DIR=/path/to/cuda
+    export CUDNN_ROOT_DIR=/path/to/cudnn
+    export TENSORRT_ROOT_DIR=/path/to/tensorrt
+    ./scripts/whl/manylinux2010/build_wheel.sh
 ```
 And you can find all of the outputs in the `output` directory.
@@ -73,10 +70,14 @@
 If you just want to build for a specific Python version, you can use the `ALL_PYTHON` environment variable, e.g.:
 ```
-    ALL_PYTHON=35m ./build_wheel.sh cpu
+    ALL_PYTHON=35m ./scripts/whl/manylinux2010/build_wheel.sh
+    ```
+    If you want to build a CPU-only version, you can set the `BUILD_WHL_CPU_ONLY` environment variable to 'ON', e.g.:
+
+    ```
+    BUILD_WHL_CPU_ONLY=ON ALL_PYTHON=35m ./scripts/whl/manylinux2010/build_wheel.sh
 ```
-    Please append `imperative` to `build_wheel.sh` to use the new runtime, e.g., `./build_wheel.sh cpu imperative`.
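    For reference, a CUDA-enabled wheel build simply combines the variables documented above. A sketch, assuming a local toolchain laid out as below: the install paths are placeholders for your machine, `36m` stands for any supported `ALL_PYTHON` tag, and `build_wheel.sh` itself enforces the required CUDA 10.1 / cuDNN 7.6.3 / TensorRT 6.0.1.5 versions:

    ```
    export CUDA_ROOT_DIR=/usr/local/cuda-10.1
    export CUDNN_ROOT_DIR=/path/to/cudnn-7.6.3
    export TENSORRT_ROOT_DIR=/path/to/TensorRT-6.0.1.5
    ALL_PYTHON=36m ./scripts/whl/manylinux2010/build_wheel.sh
    ```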
 ## build for macos
 ```
     ./scripts/whl/macos/macos_build_whl.sh
@@ -86,11 +87,6 @@
 ```
     ALL_PYTHON=3.5.9 ./scripts/whl/macos/macos_build_whl.sh
 ```
-    If you want to build with imperative rt, set env BUILD_IMPERATIVE="ON", eg:
-
-    ```
-    ALL_PYTHON=3.5.9 BUILD_IMPERATIVE="ON" ./scripts/whl/macos/macos_build_whl.sh
-    ```
 ## build for windows
 ```
     ./scripts/whl/windows/windows_build_whl.sh
@@ -100,10 +96,8 @@
 ```
     ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
 ```
-    If you want to build windows whl with cuda, also a specific Python verison. eg:
+    If you want to build a CPU-only version, you can set the `BUILD_WHL_CPU_ONLY` environment variable to 'ON', e.g.:
 ```
-    WINDOWS_WHL_WITH_CUDA="ON" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
+    BUILD_WHL_CPU_ONLY='ON' ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
 ```
-    If you want to build with imperative rt, set env BUILD_IMPERATIVE="ON", eg:
-    BUILD_IMPERATIVE="ON" WINDOWS_WHL_WITH_CUDA="ON" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
diff --git a/scripts/whl/macos/macos_build_whl.sh b/scripts/whl/macos/macos_build_whl.sh
index a10912dd..ef421621 100755
--- a/scripts/whl/macos/macos_build_whl.sh
+++ b/scripts/whl/macos/macos_build_whl.sh
@@ -84,11 +84,6 @@ function config_python_env() {
     fi
 }
 
-if [[ -z ${BUILD_IMPERATIVE} ]]
-then
-    BUILD_IMPERATIVE="OFF"
-fi
-
 function do_build() {
     for ver in ${ALL_PYTHON}
     do
@@ -116,65 +111,38 @@ function do_build() {
         #change PYTHON_LIBRARY and PYTHON_INCLUDE_DIR, so add
         #-r to remove build cache after a new ver build, which
         #will make the build slower than without -r
-        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
-            echo "build whl with IMPERATIVE python rt"
-            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -n -r
-        else
-            echo "build whl with legacy python rt"
-            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
-        fi
+        echo "build whl with imperative python rt"
+        ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
 
         #call setup.py
         BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/
         cd ${BUILD_DIR}
-        if [ -d "staging" ]; then
-            echo "remove old build cache file"
-            rm -rf staging
-        fi
+        rm -rf staging
         mkdir -p staging
-        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
-            echo "build whl with IMPERATIVE python rt"
-            cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
-            cd ${BUILD_DIR}/staging/megengine/core
-            rt_file=`ls _imperative_rt.*.so`
-            echo "rt file is: ${rt_file}"
-            if [[ -z ${rt_file} ]]
-            then
-                echo "ERR: can not find valid rt file"
-                exit -1
-            fi
-            llvm-strip -s ${rt_file}
-            mv ${rt_file} _imperative_rt.so
-            echo "check so valid or not..."
-            otool_out=`otool -L _imperative_rt.so`
-            if [[ "${otool_out}" =~ "ython" ]]; then
-                echo "ERR: invalid _imperative_rt.so which depend on python lib, detail: log"
-                echo ${otool_out}
-                exit -1
-            else
-                echo "valid..."
-            fi
+        cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+        cd ${BUILD_DIR}/staging/megengine/core
+        rt_file=`ls _imperative_rt.*.so`
+        echo "rt file is: ${rt_file}"
+        if [[ -z ${rt_file} ]]
+        then
+            echo "ERR: cannot find a valid rt file"
+            exit -1
+        fi
+        llvm-strip -s ${rt_file}
+        mv ${rt_file} _imperative_rt.so
+        echo "check that the .so is valid..."
+        otool_out=`otool -L _imperative_rt.so`
+        if [[ "${otool_out}" =~ "ython" ]]; then
+            echo "ERR: invalid _imperative_rt.so which depends on python lib, detail:"
+            echo ${otool_out}
+            exit -1
         else
-            echo "build whl with legacy python rt"
-
-            cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
-            cd ${BUILD_DIR}/staging/megengine/_internal
-            #FIXME: set lib suffix to dylib may be better, BUT we find after distutils.file_util.copy_file
-            #will change to .so at macos even we set suffix to dylib, at the same time, macos also support .so
-            echo "check so valid or not..."
-            llvm-strip -s _mgb.so
-            otool_out=`otool -L _mgb.so`
-            if [[ "${otool_out}" =~ "ython" ]]; then
-                echo "ERR: invalid _mgb.so which depend on python lib, detail: log"
-                echo ${otool_out}
-                exit -1
-            else
-                echo "valid..."
-            fi
+            echo "valid..."
         fi
+
        cd ${BUILD_DIR}/staging
        ${PYTHON_DIR}/bin/python3 setup.py bdist_wheel
        cd ${BUILD_DIR}/staging/dist/
diff --git a/scripts/whl/manylinux2010/build_wheel.sh b/scripts/whl/manylinux2010/build_wheel.sh
index 02adf235..c4762356 100755
--- a/scripts/whl/manylinux2010/build_wheel.sh
+++ b/scripts/whl/manylinux2010/build_wheel.sh
@@ -6,7 +6,11 @@ OUTPUTDIR=$(readlink -f ${CWD}/output)
 USERID=$(id -u)
 TMPFS_ARGS="--tmpfs /tmp:exec"
 
-IMPERATIVE=${2:-mgb}
+BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY}
+if [[ -z ${BUILD_WHL_CPU_ONLY} ]]
+then
+    BUILD_WHL_CPU_ONLY="OFF"
+fi
 
 echo ${BASEDIR}
 pushd ${BASEDIR}/third_party >/dev/null
@@ -16,84 +20,84 @@ popd >/dev/null
 cd ${CWD}
 mkdir -p ${OUTPUTDIR}
 
-if [[ -z ${CUDA_ROOT_DIR} ]]; then
-    echo "Environment variable CUDA_ROOT_DIR not set."
-    exit -1
-fi
-if [[ -z ${CUDNN_ROOT_DIR} ]]; then
-    echo "Environment variable CUDNN_ROOT_DIR not set."
-    exit -1
-fi
-if [[ -z ${TENSORRT_ROOT_DIR} ]]; then
-    echo "Environment variable TENSORRT_ROOT_DIR not set."
-    exit -1
-fi
-
-## YOU SHOULD MODIFY CUDA VERSION AS BELOW WHEN UPGRADE
-REQUIR_CUDA_VERSION="10010"
-REQUIR_CUDNN_VERSION="7.6.3"
-REQUIR_TENSORRT_VERSION="6.0.1.5"
-
-CUDA_ROOT_DIR_=${CUDA_ROOT_DIR%*/}
-CUDNN_ROOT_DIR_=${CUDNN_ROOT_DIR%*/}
-TENSORRT_ROOT_DIR_=${TENSORRT_ROOT_DIR%*/}
-
-CUDA_VERSION_PATH=${CUDA_ROOT_DIR_}/include/cuda.h
-CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn.h
-TENSORRT_VERSION_PATH=${TENSORRT_ROOT_DIR_}/include/NvInferVersion.h
-
-if [ ! -e $CUDA_VERSION_PATH ] ; then
-    echo file $CUDA_VERSION_PATH is not exist
-    echo please check the Environment must use CUDA-10.1 NO.$REQUIR_CUDA_VERSION
-    exit -1
-fi
-if [ ! -e $CUDNN_VERSION_PATH ] ; then
-    echo file $CUDNN_VERSION_PATH is not exist
-    echo please check the Environment must use CUDNN-V$REQUIR_CUDNN_VERSION
-    exit -1
-fi
-if [ ! -e $TENSORRT_VERSION_PATH ] ; then
-    echo file $TENSORRT_VERSION_PATH is not exist
-    echo please check the Environment must use TensorRT-$REQUIR_TENSORRT_VERSION
-    exit -1
-fi
-
-CUDA_VERSION_CONTEXT=$(head -85 ${CUDA_VERSION_PATH})
-CUDNN_VERSION_CONTEXT=$(head -62 ${CUDNN_VERSION_PATH})
-TENSORRT_VERSION_CONTEXT=$(tail -12 ${TENSORRT_VERSION_PATH})
-
-CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define __CUDA_API_VERSION * +([0-9]+)")
-CUDA_VERSION=${CUDA_API_VERSION:0-5}
-echo CUDA_VERSION:$CUDA_VERSION
-
-CUDNN_VERSION_MAJOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MAJOR * +([0-9]+)")
-CUDNN_VERSION_MINOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MINOR * +([0-9]+)")
-CUDNN_VERSION_PATCH=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_PATCHLEVEL * +([0-9]+)")
-CUDNN_VERSION=${CUDNN_VERSION_MAJOR:0-1}.${CUDNN_VERSION_MINOR:0-1}.${CUDNN_VERSION_PATCH:0-1}
-echo CUDNN_VERSION:$CUDNN_VERSION
-
-TENSORRT_VERSION_MAJOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MAJOR * +([0-9]+)")
-TENSORRT_VERSION_MINOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MINOR * +([0-9]+)")
-TENSORRT_VERSION_PATCH=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_PATCH * +([0-9]+)")
-TENSORRT_VERSION_BUILD=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_BUILD * +([0-9]+)")
-TENSORRT_VERSION=${TENSORRT_VERSION_MAJOR:0-1}.${TENSORRT_VERSION_MINOR:0-1}.${TENSORRT_VERSION_PATCH:0-1}.${TENSORRT_VERSION_BUILD:0-1}
-echo TENSORRT_VERSION:$TENSORRT_VERSION
-
-if [ $CUDA_VERSION != $REQUIR_CUDA_VERSION ] ; then
-    echo please check the Environment must use CUDA-10.1 NO.$REQUIR_CUDA_VERSION
-    exit -1
-fi
-
-if [ $CUDNN_VERSION != $REQUIR_CUDNN_VERSION ] ; then
-    echo please check the Environment must use CUDNN-V$REQUIR_CUDNN_VERSION
-    exit -1
+if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
+    if [[ -z ${CUDA_ROOT_DIR} ]]; then
+        echo "Environment variable CUDA_ROOT_DIR not set."
+        exit -1
+    fi
+    if [[ -z ${CUDNN_ROOT_DIR} ]]; then
+        echo "Environment variable CUDNN_ROOT_DIR not set."
+        exit -1
+    fi
+    if [[ -z ${TENSORRT_ROOT_DIR} ]]; then
+        echo "Environment variable TENSORRT_ROOT_DIR not set."
+        exit -1
+    fi
+
+    ## YOU SHOULD MODIFY THE CUDA VERSION BELOW WHEN UPGRADING
+    REQUIR_CUDA_VERSION="10010"
+    REQUIR_CUDNN_VERSION="7.6.3"
+    REQUIR_TENSORRT_VERSION="6.0.1.5"
+
+    CUDA_ROOT_DIR_=${CUDA_ROOT_DIR%*/}
+    CUDNN_ROOT_DIR_=${CUDNN_ROOT_DIR%*/}
+    TENSORRT_ROOT_DIR_=${TENSORRT_ROOT_DIR%*/}
+
+    CUDA_VERSION_PATH=${CUDA_ROOT_DIR_}/include/cuda.h
+    CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn.h
+    TENSORRT_VERSION_PATH=${TENSORRT_ROOT_DIR_}/include/NvInferVersion.h
+
+    if [ ! -e $CUDA_VERSION_PATH ] ; then
+        echo file $CUDA_VERSION_PATH does not exist
+        echo please check the environment: CUDA-10.1 NO.$REQUIR_CUDA_VERSION is required
+        exit -1
+    fi
+    if [ ! -e $CUDNN_VERSION_PATH ] ; then
+        echo file $CUDNN_VERSION_PATH does not exist
+        echo please check the environment: CUDNN-V$REQUIR_CUDNN_VERSION is required
+        exit -1
+    fi
+    if [ ! -e $TENSORRT_VERSION_PATH ] ; then
+        echo file $TENSORRT_VERSION_PATH does not exist
+        echo please check the environment: TensorRT-$REQUIR_TENSORRT_VERSION is required
+        exit -1
+    fi
+
+    CUDA_VERSION_CONTEXT=$(head -85 ${CUDA_VERSION_PATH})
+    CUDNN_VERSION_CONTEXT=$(head -62 ${CUDNN_VERSION_PATH})
+    TENSORRT_VERSION_CONTEXT=$(tail -12 ${TENSORRT_VERSION_PATH})
+
+    CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define __CUDA_API_VERSION * +([0-9]+)")
+    CUDA_VERSION=${CUDA_API_VERSION:0-5}
+    echo CUDA_VERSION:$CUDA_VERSION
+
+    CUDNN_VERSION_MAJOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MAJOR * +([0-9]+)")
+    CUDNN_VERSION_MINOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MINOR * +([0-9]+)")
+    CUDNN_VERSION_PATCH=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_PATCHLEVEL * +([0-9]+)")
+    CUDNN_VERSION=${CUDNN_VERSION_MAJOR:0-1}.${CUDNN_VERSION_MINOR:0-1}.${CUDNN_VERSION_PATCH:0-1}
+    echo CUDNN_VERSION:$CUDNN_VERSION
+
+    TENSORRT_VERSION_MAJOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MAJOR * +([0-9]+)")
+    TENSORRT_VERSION_MINOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MINOR * +([0-9]+)")
+    TENSORRT_VERSION_PATCH=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_PATCH * +([0-9]+)")
+    TENSORRT_VERSION_BUILD=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_BUILD * +([0-9]+)")
+    TENSORRT_VERSION=${TENSORRT_VERSION_MAJOR:0-1}.${TENSORRT_VERSION_MINOR:0-1}.${TENSORRT_VERSION_PATCH:0-1}.${TENSORRT_VERSION_BUILD:0-1}
+    echo TENSORRT_VERSION:$TENSORRT_VERSION
+
+    if [ $CUDA_VERSION != $REQUIR_CUDA_VERSION ] ; then
+        echo please check the environment: CUDA-10.1 NO.$REQUIR_CUDA_VERSION is required
+        exit -1
+    fi
+
+    if [ $CUDNN_VERSION != $REQUIR_CUDNN_VERSION ] ; then
+        echo please check the environment: CUDNN-V$REQUIR_CUDNN_VERSION is required
+        exit -1
+    fi
+
+    if [ $TENSORRT_VERSION != $REQUIR_TENSORRT_VERSION ] ; then
+        echo please check the environment: TENSORRT-$REQUIR_TENSORRT_VERSION is required
+        exit -1
+    fi
 fi
 
-if [ $TENSORRT_VERSION != $REQUIR_TENSORRT_VERSION ] ; then
-    echo please check the Environment must use TENSORRT-$REQUIR_TENSORRT_VERSION
-    exit -1
-fi
-
-docker run -it --rm $TMPFS_ARGS -e UID=${USERID} -e LOCAL_VERSION=${LOCAL_VERSION} -e ALL_PYTHON=${ALL_PYTHON} -v ${CUDA_ROOT_DIR}:/usr/local/cuda -v ${CUDNN_ROOT_DIR}:/opt/cudnn -v ${TENSORRT_ROOT_DIR}:/opt/tensorrt -v ${BASEDIR}:/home/code -v ${OUTPUTDIR}:/home/output:rw env_manylinux2010:latest /home/code/scripts/whl/manylinux2010/do_build.sh $IMPERATIVE
-
-
+docker run -it --rm $TMPFS_ARGS -e UID=${USERID} -e LOCAL_VERSION=${LOCAL_VERSION} -e BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} -e ALL_PYTHON=${ALL_PYTHON} -v ${CUDA_ROOT_DIR}:/usr/local/cuda -v ${CUDNN_ROOT_DIR}:/opt/cudnn -v ${TENSORRT_ROOT_DIR}:/opt/tensorrt -v ${BASEDIR}:/home/code -v ${OUTPUTDIR}:/home/output:rw env_manylinux2010:latest /home/code/scripts/whl/manylinux2010/do_build.sh
diff --git a/scripts/whl/manylinux2010/do_build.sh b/scripts/whl/manylinux2010/do_build.sh
index bf06afd0..10286006 100755
--- a/scripts/whl/manylinux2010/do_build.sh
+++ b/scripts/whl/manylinux2010/do_build.sh
@@ -5,80 +5,70 @@ then
     ALL_PYTHON="35m 36m 37m 38"
 fi
 
-EXTRA_CMAKE_ARGS=
-if [[ "$1" == imperative ]]; then
-    BUILD_IMPERATIVE=ON
-    SO_NAME=_imperative_rt
-    SO_PATH=megengine/core
-else
-    BUILD_IMPERATIVE=OFF
-    SO_NAME=_mgb
-    SO_PATH=megengine/_internal
+BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY}
+if [[ -z ${BUILD_WHL_CPU_ONLY} ]]
+then
+    BUILD_WHL_CPU_ONLY="OFF"
+fi
+
$0`/../../../") +BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/ +if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then + BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_ON/MGE_INFERENCE_ONLY_OFF/Release/build/ fi +SO_NAME=_imperative_rt +SO_PATH=megengine/core +NEW_LIB_PATH=core/lib for ver in ${ALL_PYTHON} do python_ver=${ver:0:2} - BUILD_DIR=/tmp/build_megengine/python${python_ver} MAJOR=${python_ver:0:1} MINOR=${ver:1} PYTHON_DIR=/opt/python/cp${python_ver}-cp${ver}/ EXT_NAME=${SO_NAME}.cpython-${ver}-x86_64-linux-gnu.so - mkdir -p ${BUILD_DIR} - pushd ${BUILD_DIR} >/dev/null - MGE_CMAKE_FLAGS="-DMGE_WITH_DISTRIBUTED=ON \ - -DMGE_WITH_CUDA=ON \ - -DCMAKE_PREFIX_PATH=${PYTHON_DIR} \ - -DCMAKE_INSTALL_PREFIX=/home/output " - if [[ "$BUILD_IMPERATIVE" == ON ]]; then - MGE_CMAKE_FLAGS+=" -DMGE_BUILD_IMPERATIVE_RT=ON \ - -DPYTHON_EXECUTABLE=${PYTHON_DIR}/bin/python3" - else - MGE_CMAKE_FLAGS+=" -DPYTHON_LIBRARY=${PYTHON_DIR}lib/ \ - -DPYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python${MAJOR}.${MINOR}" - fi - cmake /home/code ${MGE_CMAKE_FLAGS} ${EXTRA_CMAKE_ARGS} - make -j$(nproc) VERBOSE=1 - make install - mkdir -p staging - mkdir -p /home/output/debug - if [[ "$BUILD_IMPERATIVE" == ON ]]; then - cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ - else - cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ - fi - pushd dnn/cuda-stub/ >/dev/null - strip -s libcuda.so - ln -sf libcuda.so libcuda.so.1 - popd >/dev/null - pushd staging >/dev/null - pushd ${SO_PATH} >/dev/null - SO_NAME_EXT=${SO_NAME}.so - objcopy --only-keep-debug ${SO_NAME_EXT} ${EXT_NAME}.dbg - strip -s ${SO_NAME_EXT} - objcopy --add-gnu-debuglink=${EXT_NAME}.dbg ${SO_NAME_EXT} - cp -a ${EXT_NAME}.dbg /home/output/debug - mkdir -p lib/ucx - cp -L /usr/local/cuda/lib*/libnvrtc-builtins.so lib - cp -L ${BUILD_DIR}/third_party/MegRay/third_party/ucx/lib/ucx/*.so lib/ucx/ - strip -s lib/ucx/*.so - popd >/dev/null - ${PYTHON_DIR}/bin/python setup.py bdist_wheel - popd >/dev/null - popd >/dev/null - pushd /home/output >/dev/null - if [[ "$BUILD_IMPERATIVE" == ON ]]; then - NEW_LIB_PATH=core/lib - else - NEW_LIB_PATH=_internal/lib - fi - LD_LIBRARY_PATH=${BUILD_DIR}/dnn/cuda-stub:$LD_LIBRARY_PATH auditwheel repair -L ${NEW_LIB_PATH} ${BUILD_DIR}/staging/dist/Meg*.whl - chown -R ${UID}.${UID} . 
-    popd >/dev/null
-    rm -rf ${BUILD_DIR}
-done
+    export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo"
+    export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_PREFIX_PATH=${PYTHON_DIR}"
+    export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_EXECUTABLE=${PYTHON_DIR}/bin/python3"
+    export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_LIBRARY=${PYTHON_DIR}lib/"
+    export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python${MAJOR}.${MINOR}"
+
+    if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
+        ${SRC_DIR}/scripts/cmake-build/host_build.sh -c -t -r
+    else
+        ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
+    fi
+
+    cd ${BUILD_DIR}
+    rm -rf staging
+    mkdir -p staging
+    cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
 
-pushd /home/code/dnn/scripts >/dev/null
-rm -rf __pycache__
-popd >/dev/null
+
+    if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
+        cd ${BUILD_DIR}/dnn/cuda-stub/
+        strip -s libcuda.so
+        ln -sf libcuda.so libcuda.so.1
+    fi
+
+    cd ${BUILD_DIR}/staging/${SO_PATH}
+    SO_NAME_EXT=${SO_NAME}.so
+    objcopy --only-keep-debug ${SO_NAME_EXT} ${EXT_NAME}.dbg
+    strip -s ${SO_NAME_EXT}
+    objcopy --add-gnu-debuglink=${EXT_NAME}.dbg ${SO_NAME_EXT}
+    mkdir -p lib/ucx
+
+    if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
+        cp -L /usr/local/cuda/lib*/libnvrtc-builtins.so lib
+        cp -L ${BUILD_DIR}/third_party/MegRay/third_party/ucx/lib/ucx/*.so lib/ucx/
+        strip -s lib/ucx/*.so
+    fi
+
+    cd ${BUILD_DIR}/staging/
+    ${PYTHON_DIR}/bin/python setup.py bdist_wheel
+    cd /home/output
+    LD_LIBRARY_PATH=${BUILD_DIR}/dnn/cuda-stub:$LD_LIBRARY_PATH auditwheel repair -L ${NEW_LIB_PATH} ${BUILD_DIR}/staging/dist/Meg*.whl
+    chown -R ${UID}.${UID} .
+    # compat for rootless docker env: make the output removable on the host side
+    chmod -R 777 .
+done
diff --git a/scripts/whl/windows/windows_build_whl.sh b/scripts/whl/windows/windows_build_whl.sh
index 1e1d553a..d7fd0b86 100755
--- a/scripts/whl/windows/windows_build_whl.sh
+++ b/scripts/whl/windows/windows_build_whl.sh
@@ -58,9 +58,10 @@ function config_python_env() {
     PYTHON_INCLUDE_DIR=${PYTHON_DIR}/include
 }
 
-if [[ -z ${WINDOWS_WHL_WITH_CUDA} ]]
+BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY}
+if [[ -z ${BUILD_WHL_CPU_ONLY} ]]
 then
-    WINDOWS_WHL_WITH_CUDA="OFF"
+    BUILD_WHL_CPU_ONLY="OFF"
 fi
 
@@ -86,32 +87,23 @@ function depend_real_copy() {
 
 function copy_more_dll() {
     # for python whl real use
-    if [ ${BUILD_IMPERATIVE} = "ON" ]; then
-        echo "config BUILD_IMPERATIVE core lib dir"
-        CP_WHL_DST=${BUILD_DIR}/staging/megengine/core/lib
-    else
-        echo "config legacy python lib dir"
-        CP_WHL_DST=${BUILD_DIR}/staging/megengine/_internal/lib
-    fi
-    rm -rf ${CP_WHL_DST}
-    mkdir ${CP_WHL_DST}
+    echo "config imperative core lib dir"
+    CP_WHL_DST_IMP=${BUILD_DIR}/staging/megengine/core/lib
+    rm -rf ${CP_WHL_DST_IMP}
+    mkdir ${CP_WHL_DST_IMP}
+
+    # workaround for the cpu-only version import failure: use an
+    # empty.file to trigger setup.py to create the otherwise empty lib dir
-    echo "empty" > ${CP_WHL_DST}/empty.file
+    echo "empty" > ${CP_WHL_DST_IMP}/empty.file
 
-    if [ ${WINDOWS_WHL_WITH_CUDA} = "ON" ]; then
+    if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
         echo "copy nvidia lib for whl use..."
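+        # depend_real_copy (defined above) stages the nvidia runtime dlls into
+        # the new core lib dir so a cuda-enabled wheel is self-contained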
-        depend_real_copy ${CP_WHL_DST}
+        depend_real_copy ${CP_WHL_DST_IMP}
     fi
 }
 
-if [[ -z ${BUILD_IMPERATIVE} ]]
-then
-    BUILD_IMPERATIVE="OFF"
-fi
-
 function do_build() {
     for ver in ${ALL_PYTHON}
     do
@@ -144,14 +136,8 @@ function do_build() {
        #-r to remove build cache after a new ver build, which
        #will be a slower build than without -r
        BUILD_ARGS=" -t -r"
-        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
-            echo "build whl with IMPERATIVE python rt"
-            BUILD_ARGS="${BUILD_ARGS} -n "
-        else
-            echo "build whl with legacy python rt"
-        fi
-        if [ ${WINDOWS_WHL_WITH_CUDA} = "ON" ]; then
+        if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
            echo "build windows whl with cuda"
            BUILD_ARGS="${BUILD_ARGS} -c "
        else
@@ -161,39 +147,27 @@ function do_build() {
        echo "host_build.sh BUILD_ARGS: ${BUILD_ARGS}"
        ${SRC_DIR}/scripts/cmake-build/host_build.sh ${BUILD_ARGS}
 
-        #call setup.py
        BUILD_DIR=${SRC_DIR}/build_dir/host/build/
        cd ${BUILD_DIR}
-        if [ -d "staging" ]; then
-            echo "remove old build cache file"
-            rm -rf staging
-        fi
+        rm -rf staging
        mkdir -p staging
-
-        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
-            echo "build whl with IMPERATIVE python rt"
-            cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
-            cd ${BUILD_DIR}/staging/megengine/core
-            rt_file=`ls _imperative_rt.*.pyd`
-            echo "rt file is: ${rt_file}"
-            if [[ -z ${rt_file} ]]
-            then
-                echo "ERR: can not find valid rt file"
-                exit -1
-            fi
-            llvm-strip -s ${rt_file}
-            mv ${rt_file} _imperative_rt.pyd
-        else
-            echo "build whl with legacy python rt"
-
-            cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
-            cd ${BUILD_DIR}/staging/megengine/_internal
-            llvm-strip -s _mgb.pyd
+        cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+        cd ${BUILD_DIR}/staging/megengine/core
+        rt_file=`ls _imperative_rt.*.pyd`
+        echo "rt file is: ${rt_file}"
+        if [[ -z ${rt_file} ]]
+        then
+            echo "ERR: cannot find valid rt file"
+            exit -1
         fi
+        llvm-strip -s ${rt_file}
+        mv ${rt_file} _imperative_rt.pyd
+
+        copy_more_dll
 
        cd ${BUILD_DIR}/staging
+        echo "call setup.py now"
        ${PYTHON_DIR}/python3 setup.py bdist_wheel
        cp ${BUILD_DIR}/staging/dist/Meg*.whl ${WINDOWS_WHL_HOME}/
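As background for the `objcopy` calls added in `scripts/whl/manylinux2010/do_build.sh` above: they follow the standard GNU binutils split-debug recipe, so the shipped wheel stays small while the debug info remains usable. A minimal standalone sketch (the file name `mylib.so` is a placeholder, not the script's real extension name):

```
# split-debug sketch: keep symbols out of the shipped binary but still debuggable
objcopy --only-keep-debug mylib.so mylib.so.dbg     # copy debug sections to a side file
strip -s mylib.so                                   # strip the binary that ships
objcopy --add-gnu-debuglink=mylib.so.dbg mylib.so   # record where gdb can find the symbols
```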