diff --git a/CMakeLists.txt b/CMakeLists.txt index b1b42d61..4fe36c64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,9 +53,11 @@ option(MGE_WITH_DISTRIBUTED "Build with distributed support" ON) option(MGE_BUILD_IMPERATIVE_RT "Build _imperative_rt Python Module " ON) option(MGE_BUILD_SDK "Build load_and_run" ON) option(MGE_INFERENCE_ONLY "Build inference only library." OFF) +option(MGE_WITH_PYTHON_MODULE "Build MegEngine legacy Python Module." OFF) option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support," ON) option(MGE_WITH_ROCM "Enable ROCM support" OFF) + if(NOT ${MGE_BIN_REDUCE} STREQUAL "") message("build with BIN REDUCE") if(MGE_WITH_MINIMUM_SIZE) @@ -152,6 +154,14 @@ if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} S endif() if(MSVC OR WIN32) + # for cmake after 3.15.2 + cmake_policy(SET CMP0091 NEW) + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebug") + else() + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded") + endif() + add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) message("-- into windows build...") message("-- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") @@ -285,7 +295,6 @@ if(MGE_WITH_TEST) endif() if(MGE_BUILD_IMPERATIVE_RT) - add_compile_definitions(MGB_ENABLE_IMPERATIVE_RUNTIME) set(CMAKE_CXX_STANDARD 17) endif() @@ -701,7 +710,8 @@ endif() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MARCH}") -set(MGB_ENABLE_IMPERATIVE ${MGE_BUILD_IMPERATIVE_RT}) +set(MGE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/src/version.ld CACHE INTERNAL "Path to linker version script") + # Write out megbrain_build_config.h # It defines macros needed by both megbrain and dnn configure_file(src/megbrain_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h) @@ -831,3 +841,8 @@ if(MSVC OR WIN32) endif() endforeach() endif() + +if(MGE_WITH_JIT_MLIR) + add_subdirectory(tools/mlir/mgb-opt) + add_subdirectory(tools/mlir/mgb-file-check) +endif() diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index 41142948..3b1a5caa 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -683,6 +683,53 @@ protected: }; /** + * \brief base class for AdaptivePooling + */ +class AdaptivePoolingBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(AdaptivePoolingBase, OperatorBase); + DEF_OPR_PARAM(AdaptivePooling); + +protected: + param::Pooling deduce_pooling_param(const TensorLayout& src, + const TensorLayout& dst); +}; + +class AdaptivePoolingForward : public AdaptivePoolingBase { + DEF_OPR_IMPL(AdaptivePoolingForward, AdaptivePoolingBase, 1, 1); + +public: + /** + * \param[in] src input tensor + * \param[out] dst output tensor + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) = 0; +}; + +using AdaptivePooling = AdaptivePoolingForward; + +class AdaptivePoolingBackward : public AdaptivePoolingBase { + DEF_OPR_IMPL(AdaptivePoolingBackward, AdaptivePoolingBase, 3, 1); + +public: + /** + * \param[in] src the `src' parameter in AdaptivePoolingForward::exec + * \param[in] dst the `dst' parameter in AdaptivePoolingForward::exec + * \param[in] diff the backpropagated gradient wrt. dst + * \param[out] grad the backpropagated gradient wrt. 
src + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& diff, + const TensorLayout& grad) = 0; +}; + +/** * \brief base class for Local */ class LocalBase : public OperatorBase { diff --git a/dnn/scripts/opr_param_defs.py b/dnn/scripts/opr_param_defs.py index 3d16e178..1b16b5db 100755 --- a/dnn/scripts/opr_param_defs.py +++ b/dnn/scripts/opr_param_defs.py @@ -179,6 +179,11 @@ pdef('Axis').add_fields('int32', 'axis', 0) add_enum_alias('Format', 'ConvolutionV0') ) +(pdef('AdaptivePooling'). + add_enum_alias('Mode', 'Pooling'). + add_enum_alias('Format', 'ConvolutionV0') + ) + (pdef('LRN', 'see ImageNet Classification with Deep Convolutional Neural Networks for' ' meaning of the fields'). diff --git a/dnn/src/atlas/megcore/computing_context.cpp b/dnn/src/atlas/megcore/computing_context.cpp index c92d3ab4..715dcc77 100644 --- a/dnn/src/atlas/megcore/computing_context.cpp +++ b/dnn/src/atlas/megcore/computing_context.cpp @@ -55,8 +55,12 @@ void AtlasComputingContext::memcpy(void* dst, const void* src, default: megdnn_throw("bad atlas memcpy kind"); } +#if MGB_USE_ATLAS_ASYNC_API acl_check(aclrtMemcpyAsync(dst, size_in_bytes, src, size_in_bytes, atlas_kind, m_ctx.stream)); +#else + acl_check(aclrtMemcpy(dst, size_in_bytes, src, size_in_bytes, atlas_kind)); +#endif } void AtlasComputingContext::memset(void* dst, int value, size_t size_in_bytes) { @@ -65,7 +69,11 @@ void AtlasComputingContext::memset(void* dst, int value, size_t size_in_bytes) { } void AtlasComputingContext::synchronize() { +#if MGB_USE_ATLAS_ASYNC_API acl_check(aclrtSynchronizeStream(m_ctx.stream)); +#else + return; +#endif } // vim: syntax=cpp.doxygen diff --git a/dnn/src/common/adaptive_pooling.cpp b/dnn/src/common/adaptive_pooling.cpp new file mode 100644 index 00000000..56bcb3a1 --- /dev/null +++ b/dnn/src/common/adaptive_pooling.cpp @@ -0,0 +1,37 @@ +/** + * \file dnn/src/common/adaptive_pooling.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "megdnn/opr_param_defs.h" +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +namespace megdnn { + +param::Pooling AdaptivePoolingBase::deduce_pooling_param( + const TensorLayout& src, const TensorLayout& dst) { + megdnn_assert(param().format == param::AdaptivePooling::Format::NCHW); + size_t IH = src.shape[2], IW = src.shape[3], OH = dst.shape[2], + OW = dst.shape[3]; + + param::Pooling ret; + ret.mode = param().mode; + ret.format = param().format; + ret.pad_h = ret.pad_w = 0; + ret.stride_h = floor(IH / OH); + ret.stride_w = floor(IW / OW); + ret.window_h = IH - (OH - 1) * ret.stride_h; + ret.window_w = IW - (OW - 1) * ret.stride_w; + + return ret; +} +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp index e9b90d0e..74624414 100644 --- a/dnn/src/common/basic_types.cpp +++ b/dnn/src/common/basic_types.cpp @@ -392,8 +392,6 @@ TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const { TensorLayout result{dtype, format}; result.ndim = tshape.ndim; for (size_t i = 0; i < tshape.ndim; i++) { - megdnn_throw_if(!tshape.shape[i], tensor_reshape_error, - megdnn_mangle("target shape is 0")); result.shape[i] = tshape.shape[i]; result.stride[i] = (tshape.shape[i] == 1); } @@ -409,8 +407,6 @@ TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const { for (size_t i = 0; i < tshape.ndim; ++i) { int target_idx = tshape.ndim - i - 1; int cur_idx = ndim - i - 1; - megdnn_throw_if(!tshape.shape[target_idx], tensor_reshape_error, - megdnn_mangle("target shape is 0")); size_t cur_shape = (cur_idx >= 0 ? shape[cur_idx] : 1), cur_stride = (cur_idx >= 0 ? stride[cur_idx] : 0); if (tshape.shape[target_idx] != cur_shape) { @@ -434,10 +430,16 @@ TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const { bool TensorLayout::try_reshape(TensorLayout& result, const TensorShape& tshp) const { megdnn_assert(tshp.ndim); + + bool is_empty_shape = false; for (size_t i = 0; i < tshp.ndim; ++i) { - megdnn_throw_if(!tshp.shape[i], tensor_reshape_error, - megdnn_mangle(ssprintf("bad target tshp: %s", - tshp.to_string().c_str()))); + if (!tshp.shape[i]) { + megdnn_throw_if(!format.is_default(), tensor_reshape_error, + megdnn_mangle(ssprintf("bad target tshp: %s", + tshp.to_string().c_str()))); + is_empty_shape = true; + break; + } } megdnn_throw_if( @@ -454,6 +456,11 @@ bool TensorLayout::try_reshape(TensorLayout& result, result.format = this->format; result.TensorShape::operator=(tshp); + if (is_empty_shape) { + result.init_contiguous_stride(); + return true; + } + size_t sdim = 0, prod = 1, cont_sdim = 0; for (size_t i = 0; i < tshp.ndim; ++i) { megdnn_assert(cont_sdim < cont.ndim); diff --git a/dnn/src/common/handle_impl.h b/dnn/src/common/handle_impl.h index 354bbd0a..84469972 100644 --- a/dnn/src/common/handle_impl.h +++ b/dnn/src/common/handle_impl.h @@ -199,6 +199,8 @@ private: cb(Remap) \ cb(RemapBackwardData) \ cb(RemapBackwardMat) \ + cb(AdaptivePoolingForward) \ + cb(AdaptivePoolingBackward) \ /*! * \brief specialize HandleImpl::create_operator for a single opr type; diff --git a/dnn/src/cuda/adaptive_pooling/opr_impl.cpp b/dnn/src/cuda/adaptive_pooling/opr_impl.cpp new file mode 100644 index 00000000..c9ece2b4 --- /dev/null +++ b/dnn/src/cuda/adaptive_pooling/opr_impl.cpp @@ -0,0 +1,53 @@ +/** + * \file dnn/src/cuda/adaptive_pooling/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/cuda/adaptive_pooling/opr_impl.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +void AdaptivePoolingForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + auto opr = handle()->create_operator(); + opr->param() = deduce_pooling_param(src.layout, dst.layout); + opr->exec(src, dst, workspace); +} + +size_t AdaptivePoolingForwardImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& dst) { + auto opr = handle()->create_operator(); + opr->param() = deduce_pooling_param(src, dst); + return opr->get_workspace_in_bytes(src, dst); +} + +void AdaptivePoolingBackwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in dst, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + auto opr = handle()->create_operator(); + opr->param() = deduce_pooling_param(src.layout, dst.layout); + opr->exec(src, dst, diff, grad, workspace); +} + +size_t AdaptivePoolingBackwardImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad) { + auto opr = handle()->create_operator(); + opr->param() = deduce_pooling_param(src, dst); + return opr->get_workspace_in_bytes(src, dst, diff, grad); +} +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/adaptive_pooling/opr_impl.h b/dnn/src/cuda/adaptive_pooling/opr_impl.h new file mode 100644 index 00000000..5df0538b --- /dev/null +++ b/dnn/src/cuda/adaptive_pooling/opr_impl.h @@ -0,0 +1,44 @@ +/** + * \file dnn/src/cuda/adaptive_pooling/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once +#include "megdnn/oprs.h" + +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +class AdaptivePoolingForwardImpl final : public AdaptivePoolingForward { +public: + using AdaptivePoolingForward::AdaptivePoolingForward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) override; +}; + +class AdaptivePoolingBackwardImpl final : public AdaptivePoolingBackward { +public: + using AdaptivePoolingBackward::AdaptivePoolingBackward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& diff, + const TensorLayout& grad) override; +}; +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/handle_create.cpp b/dnn/src/cuda/handle_create.cpp index e060a43d..59262b5d 100644 --- a/dnn/src/cuda/handle_create.cpp +++ b/dnn/src/cuda/handle_create.cpp @@ -11,6 +11,7 @@ #include "src/common/handle_impl.h" +#include "src/cuda/adaptive_pooling/opr_impl.h" #include "src/cuda/add_update/opr_impl.h" #include "src/cuda/argmxx/opr_impl.h" #include "src/cuda/argsort/opr_impl.h" diff --git a/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_impl.cuinl b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_impl.cuinl index a640d865..de8ec033 100644 --- a/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_impl.cuinl +++ b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_impl.cuinl @@ -72,6 +72,7 @@ namespace indexing_multi_axis_vec { #define cb0(_dtype) \ MEGDNN_FOREACH_TENSOR_NDIM(INST, DTypeTrait<_dtype>::ctype) MEGDNN_FOREACH_COMPUTING_DTYPE(cb0) + cb0(::megdnn::dtype::Bool) #undef cb0 #undef INST diff --git a/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_incr.cu b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_incr.cu index 74874f23..cfefef12 100644 --- a/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_incr.cu +++ b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_incr.cu @@ -39,6 +39,11 @@ __device__ void atomicAdd(megdnn::dt_int16 *, megdnn::dt_int16) { ((int*)0)[0] = 1; } +__device__ void atomicAdd(megdnn::dt_bool *, megdnn::dt_bool) { + __trap(); + ((int*)0)[0] = 1; +} + #define KERN_APPLY_OPR_OPR \ ::megdnn::cuda::indexing_multi_axis_vec::OprAtomicIncr #include "./kern_apply_opr_impl.cuinl" diff --git a/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.cpp b/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.cpp index 4e864905..85b83468 100644 --- a/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.cpp +++ b/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.cpp @@ -120,6 +120,7 @@ void ExecImpl::dispatch_exec() { case DTypeTrait<_dtype>::enumv: \ return dispatch_exec_ctype::ctype>(); MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + cb(::megdnn::dtype::Bool) #undef cb default: megdnn_throw("bad dtype"); diff --git a/dnn/src/naive/adaptive_pooling/opr_impl.cpp b/dnn/src/naive/adaptive_pooling/opr_impl.cpp new file mode 100644 index 00000000..0d6f53e6 --- /dev/null +++ b/dnn/src/naive/adaptive_pooling/opr_impl.cpp @@ -0,0 +1,52 @@ +/** + * \file dnn/src/naive/adaptive_pooling/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/naive/adaptive_pooling/opr_impl.h" + +#include "src/common/opr_delegate.h" +#include "src/common/utils.h" +#include "src/naive/handle.h" + +namespace megdnn { +namespace naive { + +void AdaptivePoolingForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + MEGDNN_DISPATCH_CPU_KERN(static_cast(handle()), { + auto opr = inplace_cpu_handle()->create_operator(); + opr->param() = deduce_pooling_param(src.layout, dst.layout); + opr->exec(src, dst, workspace); + }); +} + +void AdaptivePoolingBackwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in dst, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + MEGDNN_DISPATCH_CPU_KERN(static_cast(handle()), { + auto opr = inplace_cpu_handle()->create_operator(); + opr->param() = deduce_pooling_param(src.layout, dst.layout); + opr->exec(src, dst, diff, grad, workspace); + }); +} + +size_t AdaptivePoolingBackwardImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad) { + auto opr = inplace_cpu_handle()->create_operator(); + opr->param() = deduce_pooling_param(src, dst); + return opr->get_workspace_in_bytes(src, dst, diff, grad); +} +} // namespace naive +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/adaptive_pooling/opr_impl.h b/dnn/src/naive/adaptive_pooling/opr_impl.h new file mode 100644 index 00000000..cb3bec17 --- /dev/null +++ b/dnn/src/naive/adaptive_pooling/opr_impl.h @@ -0,0 +1,43 @@ +/** + * \file dnn/src/naive/adaptive_pooling/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +namespace megdnn { +namespace naive { + +class AdaptivePoolingForwardImpl : public AdaptivePoolingForward { +public: + using AdaptivePoolingForward::AdaptivePoolingForward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout&, + const TensorLayout&) override { + return 0; + } +}; + +class AdaptivePoolingBackwardImpl : public AdaptivePoolingBackward { +public: + using AdaptivePoolingBackward::AdaptivePoolingBackward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& diff, + const TensorLayout& grad) override; +}; +} // namespace naive +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/handle.cpp b/dnn/src/naive/handle.cpp index 6a1129d9..fd4cb780 100644 --- a/dnn/src/naive/handle.cpp +++ b/dnn/src/naive/handle.cpp @@ -13,6 +13,7 @@ #include "src/common/handle_impl.h" +#include "src/naive/adaptive_pooling/opr_impl.h" #include "src/naive/add_update/opr_impl.h" #include "src/naive/argmxx/opr_impl.h" #include "src/naive/argsort/opr_impl.h" diff --git a/dnn/src/naive/indexing_multi_axis_vec/opr_impl.cpp b/dnn/src/naive/indexing_multi_axis_vec/opr_impl.cpp index 52de1335..16ca74b0 100644 --- a/dnn/src/naive/indexing_multi_axis_vec/opr_impl.cpp +++ b/dnn/src/naive/indexing_multi_axis_vec/opr_impl.cpp @@ -88,6 +88,7 @@ void dispatch_exec(HandleImpl *handle, } switch (data.layout.dtype.enumv()) { MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + cb(::megdnn::dtype::Bool) default: megdnn_throw(megdnn_mangle("bad dtype")); } diff --git a/dnn/test/common/adaptive_pooling.h b/dnn/test/common/adaptive_pooling.h new file mode 100644 index 00000000..7e449513 --- /dev/null +++ b/dnn/test/common/adaptive_pooling.h @@ -0,0 +1,55 @@ +/** + * \file dnn/test/common/adaptive_pooling.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once +#include +#include "megdnn/basic_types.h" +#include "megdnn/opr_param_defs.h" + +namespace megdnn { +namespace test { +namespace adaptive_pooling { + +struct TestArg { + param::AdaptivePooling param; + TensorShape ishape; + TensorShape oshape; + TestArg(param::AdaptivePooling param, TensorShape ishape, + TensorShape oshape) + : param(param), ishape(ishape), oshape(oshape) {} +}; + +inline std::vector get_args() { + std::vector args; + using Param = param::AdaptivePooling; + using Mode = param::AdaptivePooling::Mode; + + for (size_t i = 36; i < 40; ++i) { + args.emplace_back(Param{Mode::AVERAGE}, TensorShape{2, 3, i, i + 1}, + TensorShape{2, 3, i - 4, i - 2}); + args.emplace_back(Param{Mode::MAX}, TensorShape{2, 3, i, i + 1}, + TensorShape{2, 3, i - 4, i - 2}); + } + + for (size_t i = 5; i < 10; ++i) { + args.emplace_back(Param{Mode::AVERAGE}, TensorShape{2, 3, i, i + 1}, + TensorShape{2, 3, i - 3, i - 2}); + args.emplace_back(Param{Mode::MAX}, TensorShape{2, 3, i, i + 1}, + TensorShape{2, 3, i - 3, i - 2}); + } + return args; +} + +} // namespace adaptive_pooling +} // namespace test +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/test/common/opr_trait.h b/dnn/test/common/opr_trait.h index 22f0f349..66af7db3 100644 --- a/dnn/test/common/opr_trait.h +++ b/dnn/test/common/opr_trait.h @@ -41,6 +41,8 @@ DEF(Images2NeibsForward, 2, true, true); DEF(Images2NeibsBackward, 2, true, false); DEF(PoolingForward, 2, true, true); DEF(PoolingBackward, 4, true, false); +DEF(AdaptivePoolingForward, 2, true, false); +DEF(AdaptivePoolingBackward, 4, true, false); DEF(LocalForward, 3, true, true); DEF(LocalBackwardData, 3, true, false); DEF(LocalBackwardFilter, 3, true, false); diff --git a/dnn/test/cuda/adaptive_pooling.cpp b/dnn/test/cuda/adaptive_pooling.cpp new file mode 100644 index 00000000..14d444c5 --- /dev/null +++ b/dnn/test/cuda/adaptive_pooling.cpp @@ -0,0 +1,97 @@ +/** + * \file dnn/test/cuda/adaptive_pooling.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "test/cuda/fixture.h" + +#include "megdnn/tensor_iter.h" +#include "test/common/adaptive_pooling.h" +#include "test/common/checker.h" + +#include "src/common/utils.h" +#include "test/cuda/utils.h" + +#include +#include "test/cuda/benchmark.h" + +namespace megdnn { +namespace test { + +TEST_F(CUDA, ADAPTIVE_POOLING_FORWARD) { + auto args = adaptive_pooling::get_args(); + using Format = param::AdaptivePooling::Format; + DType dtype = dtype::Float32(); + for (auto&& arg : args) { + auto param = arg.param; + auto src = arg.ishape; + auto dst = arg.oshape; + param.format = Format::NCHW; + Checker checker(handle_cuda()); + checker.set_epsilon(1e-2); + checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec( + TensorShapeArray{src, dst, {}}); + } +} + +TEST_F(CUDA, ADAPTIVE_POOLING_BACKWARD) { + auto args = adaptive_pooling::get_args(); + for (auto&& arg : args) { + Checker checker(handle_cuda()); + TensorLayout ilayout = TensorLayout(arg.ishape, dtype::Float32()); + TensorLayout olayout = TensorLayout(arg.oshape, dtype::Float32()); + + auto constraint = [this, + arg](CheckerHelper::TensorValueArray& tensors_orig) { + megdnn_assert(tensors_orig.size() == 4); + auto opr = handle_cuda()->create_operator(); + opr->param() = arg.param; + + auto tensors_cuda_storage = CheckerHelper::alloc_tensors( + handle_cuda(), + {tensors_orig[0].layout, tensors_orig[1].layout}, 0); + auto&& tensors_cuda = *tensors_cuda_storage; + + auto span = tensors_cuda[0].layout.span(); + auto dst = static_cast(tensors_cuda[0].raw_ptr) + + span.low_byte; + auto src = static_cast(tensors_orig[0].raw_ptr) + + span.low_byte; + megdnn_memcpy_H2D(handle_cuda(), dst, src, span.dist_byte()); + + auto workspace_size = opr->get_workspace_in_bytes( + tensors_cuda[0].layout, tensors_cuda[1].layout); + auto workspace_cuda = megdnn_malloc(handle_cuda(), workspace_size); + Workspace workspace{static_cast(workspace_cuda), + workspace_size}; + opr->exec(tensors_cuda[0], tensors_cuda[1], workspace); + megdnn_free(handle_cuda(), workspace_cuda); + + span = tensors_cuda[1].layout.span(); + dst = static_cast(tensors_orig[1].raw_ptr) + + span.low_byte; + src = static_cast(tensors_cuda[1].raw_ptr) + + span.low_byte; + megdnn_memcpy_D2H(handle_cuda(), dst, src, span.dist_byte()); + }; + + DType dtype = dtype::Float32(); + checker.set_tensors_constraint(constraint) + .set_dtype(0, dtype) + .set_dtype(1, dtype) + .set_dtype(2, dtype) + .set_dtype(3, dtype) + .set_param(arg.param) + .exec(TensorShapeArray{ilayout, olayout, olayout, ilayout}); + } +} +} // namespace test +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index 58b769be..79a9a185 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
*/ #include "megdnn/oprs/nn.h" @@ -37,7 +38,7 @@ std::vector get_resnet50_bench_args(size_t batch = 64) { args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2}); args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2}); - + args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1}); args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1}); args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1}); @@ -614,11 +615,8 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) { param.stride_h = param.stride_w = 1; param.format = param::ConvBias::Format::CHWN4; param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH; - checker.set_param(param).execs({{4, 12, 12, 32, 4}, - {4, 3, 3, 16, 4}, - {4, 1, 1, 1, 4}, - {}, - {}}); + checker.set_param(param).execs( + {{4, 12, 12, 32, 4}, {4, 3, 3, 16, 4}, {4, 1, 1, 1, 4}, {}, {}}); } TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) { @@ -1076,7 +1074,6 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) { } - #if CUDA_VERSION >= 10020 /// \note: we only check several cases and block sizes in megdnn_test, the full /// testcases are written in cutlass repository @@ -1234,8 +1231,7 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) { handle_cuda(), get_resnet50_bench_args(64), dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, - "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", - param::ConvBias::Format::NCHW4); + "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4); } #endif } // namespace test diff --git a/imperative/CMakeLists.txt b/imperative/CMakeLists.txt index a0721270..24d788ba 100644 --- a/imperative/CMakeLists.txt +++ b/imperative/CMakeLists.txt @@ -47,8 +47,7 @@ add_custom_target(gen_opr_py DEPENDS ${GEN_OPS_FILE}) ##################### end of opdef generation ######################### -set(VERSION_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/src/version.ld) -add_custom_target(_version_ld SOURCES ${VERSION_SCRIPT}) +add_custom_target(_version_ld SOURCES ${MGE_VERSION_SCRIPT}) add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11) pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS}) @@ -57,8 +56,21 @@ if (APPLE) elseif (MSVC OR WIN32) # Windows does not support implicitly importing data members from DLL. 
target_link_libraries(${MODULE_NAME} PRIVATE megbrain megdnn) + message("-- CMAKE_MSVC_RUNTIME_LIBRARY: ${CMAKE_MSVC_RUNTIME_LIBRARY}") + set_target_properties(${MODULE_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY "${CMAKE_MSVC_RUNTIME_LIBRARY}") else() - target_link_libraries(${MODULE_NAME} PRIVATE megengine_export -Wl,--version-script=${VERSION_SCRIPT}) + if (MGE_WITH_PYTHON_MODULE) + # use to fix runtime crash when build both mgb(MGE_WITH_PYTHON_MODULE) and imperative(MGE_BUILD_IMPERATIVE_RT) + target_link_libraries(${MODULE_NAME} PRIVATE megengine_export -Wl,--version-script=${MGE_VERSION_SCRIPT}) + else() + # use to reduce whl size by depend on megbrain/dnn directly, caused by cmake create two cuda fatbin + # elf section on both megengine_export and target which depend on megengine_export + target_link_libraries(${MODULE_NAME} PRIVATE megbrain megdnn -Wl,--version-script=${MGE_VERSION_SCRIPT}) + if (MGE_WITH_DISTRIBUTED) + message("-- Imperative configured to link megray") + target_link_libraries(${MODULE_NAME} PRIVATE megray) + endif() + endif() endif() target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR}) diff --git a/imperative/python/megengine/__init__.py b/imperative/python/megengine/__init__.py index fa1d6962..083032fd 100644 --- a/imperative/python/megengine/__init__.py +++ b/imperative/python/megengine/__init__.py @@ -76,7 +76,7 @@ from .logger import enable_debug_log, get_logger, set_log_file, set_log_level from .serialization import load, save from .tensor import Parameter, Tensor, tensor from .version import __version__ -from .core import cgtools +from .utils import comp_graph_tools as cgtools _set_fork_exec_path_for_timed_func( sys.executable, diff --git a/imperative/python/megengine/autodiff/grad_manager.py b/imperative/python/megengine/autodiff/grad_manager.py index 2a0f6906..001c9f9d 100644 --- a/imperative/python/megengine/autodiff/grad_manager.py +++ b/imperative/python/megengine/autodiff/grad_manager.py @@ -20,7 +20,7 @@ class GradManager: the forward operations start and when all resources should be released. A typical usage of GradManager is as follows: - .. codeblock:: + .. code-block:: gm = GradManager() gm.attach(model.parameters()) @@ -32,7 +32,7 @@ class GradManager: You can also use `record()` and `release()` method instead of `with` context: - .. codeblock:: + .. code-block:: gm = GradManager() gm.attach(model.parameters()) @@ -50,7 +50,7 @@ class GradManager: processes. Users will finally get the averaged gradients if an "AllReduce" callback is registered as follows: - .. codeblock:: + .. code-block:: import megengine.distributed as dist @@ -71,7 +71,7 @@ class GradManager: r"""Registers parameters that gradients should be calculated with respect to. Callback Functions should have a signature like this: - .. codeblock:: + .. 
code-block:: def cb(param: Tensor, grad: Tensor) -> Tensor: # do something @@ -100,6 +100,8 @@ class GradManager: :param ys: outputs of forward operators, e.g., the loss tensor :param dys: derivatives of ys """ + from ..functional import ones_like + global backwarding_grad_manager cache = backwarding_grad_manager backwarding_grad_manager = self @@ -113,7 +115,7 @@ class GradManager: if not isinstance(ys, (tuple, list)): ys = [ys] if dys is None: - dys = [tensor(1.0).broadcast(y.shape) for y in ys] + dys = [ones_like(y) for y in ys] if not isinstance(dys, (tuple, list)): dys = [dys] try: diff --git a/imperative/python/megengine/core/__init__.py b/imperative/python/megengine/core/__init__.py index 50d29e9e..4fd130bc 100644 --- a/imperative/python/megengine/core/__init__.py +++ b/imperative/python/megengine/core/__init__.py @@ -11,4 +11,3 @@ import sys from .tensor import Tensor from .tensor.megbrain_graph import Graph -from .utils import comp_graph_tools as cgtools diff --git a/imperative/python/megengine/core/_wrap.py b/imperative/python/megengine/core/_wrap.py index c4bf7564..538518a1 100644 --- a/imperative/python/megengine/core/_wrap.py +++ b/imperative/python/megengine/core/_wrap.py @@ -22,11 +22,13 @@ class Device: else: self._cn = CompNode(device) + self.logical_name = self._cn.logical_name + def to_c(self): return self._cn def __repr__(self): - return "{}({})".format(type(self).__qualname__, self) + return "{}({})".format(type(self).__qualname__, repr(self._cn)) def __str__(self): return str(self._cn) diff --git a/imperative/python/megengine/core/autodiff/builtin_op_utils.py b/imperative/python/megengine/core/autodiff/builtin_op_utils.py index 51f54194..6ed12afb 100644 --- a/imperative/python/megengine/core/autodiff/builtin_op_utils.py +++ b/imperative/python/megengine/core/autodiff/builtin_op_utils.py @@ -160,7 +160,7 @@ def subtensor_grad_fn(op, inputs, outputs, input_requires_grad): def make_grad(grad_op, dy): grad = ( TensorWrapper(0, dtype=dy.dtype, device=dy.device) - .broadcast(TensorWrapper(input_shape)) + ._broadcast(TensorWrapper(input_shape)) .__wrapped__ ) (dx,) = apply(grad_op, grad, dy, *params) @@ -186,7 +186,7 @@ def indexingMultiAxisVec_grad_fn(op, inputs, outputs, input_requires_grad): def make_grad(grad_op, dy): grad = ( TensorWrapper(0, dtype=dy.dtype, device=dy.device) - .broadcast(TensorWrapper(input_shape)) + ._broadcast(TensorWrapper(input_shape)) .__wrapped__ ) (dx,) = apply(grad_op, grad, dy, *params) diff --git a/imperative/python/megengine/core/tensor/function.py b/imperative/python/megengine/core/tensor/function.py index b50e8261..87f734b3 100644 --- a/imperative/python/megengine/core/tensor/function.py +++ b/imperative/python/megengine/core/tensor/function.py @@ -50,8 +50,8 @@ class Function: """ Applies operations to ``inputs`` and returns results. It must be overriden by all subclasses. - :param input: Input tensors. - :return: A tuple of Tensor or a single Tensor. + :param input: input tensors. + :return: a tuple of Tensor or a single Tensor. .. note:: @@ -64,12 +64,12 @@ class Function: """ Compute the gradient of the forward function. It must be overriden by all subclasses. - :param output_grads: gradients of outputs that are returned by :meth:`~.function.Function.forward` + :param output_grads: gradients of outputs that are returned by :meth:`~.function.Function.forward`. - .. note:: + .. note:: - In case when some tensors of outputs are not related to loss function, the corresponding - values in ``output_grads`` would be ``None``. 
+            In case some output tensors are not related to the loss function, the corresponding
+            values in ``output_grads`` would be ``None``.
 
         .. note::
diff --git a/imperative/python/megengine/core/tensor/indexing.py b/imperative/python/megengine/core/tensor/indexing.py
index cbbc61a1..40a6f1ab 100644
--- a/imperative/python/megengine/core/tensor/indexing.py
+++ b/imperative/python/megengine/core/tensor/indexing.py
@@ -173,7 +173,7 @@ def unpack_getitem(inp, tuple_val, *, allow_newaxis=True):
             item.append(True)
             v = get_index(v)
             assert np.issubdtype(v.dtype, np.integer) or np.issubdtype(
-                v.dtype, np.bool
+                v.dtype, np.bool_
             ), "var type in the subscript must be int or bool"
             tensors.append(v)
 
@@ -267,7 +267,7 @@ def setitem(tensor, index, value):
                     value.shape, tmp_result.shape
                 )
             )
-        value = value.broadcast(tmp_result.shape)
+        value = value._broadcast(tmp_result.shape)
     if use_subtensor:
         op = builtin.SetSubtensor(items=items)
     else:
diff --git a/imperative/python/megengine/core/tensor/megbrain_graph.py b/imperative/python/megengine/core/tensor/megbrain_graph.py
index df20a6cd..2330af73 100644
--- a/imperative/python/megengine/core/tensor/megbrain_graph.py
+++ b/imperative/python/megengine/core/tensor/megbrain_graph.py
@@ -8,6 +8,7 @@
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import collections
 import json
+import os
 import threading
 import weakref
 from concurrent.futures import Future, ThreadPoolExecutor
@@ -49,7 +50,16 @@ class Graph(_imperative_rt.ComputingGraph):
 
     def execute(self, *args):
         assert self._future is None
-        self._future = self._executor.submit(self._function.execute, *args)
+
+        def wrapped(*args):
+            try:
+                self._function.execute(*args)
+            except Exception as exc:
+                for i in self._function._all_rendezvous:
+                    i.set_exception(str(exc))
+                raise exc
+
+        self._future = self._executor.submit(wrapped, *args)
 
     def wait(self):
         assert self._future is not None
@@ -275,6 +285,7 @@ def dump_graph(
     keep_param_name: bool = False,
     keep_opr_priority: bool = False,
     strip_info_file=None,
+    append_json=False
 ):
     """serialize the computing graph of `output_vars` and get byte result.
 
@@ -295,6 +306,9 @@
     :param keep_opr_priority: whether to keep priority setting for operators
     :param strip_info_file: a string for path or a file handler. if is not None,
         then the dump information for code strip would be written to ``strip_info_file``
+    :param append_json: only takes effect when ``strip_info_file`` is not None. If set
+        to True, the strip information will be appended to ``strip_info_file``;
+        if set to False, ``strip_info_file`` will be overwritten
     :return: dump result as byte string, and an instance of namedtuple
         :class:`CompGraphDumpResult`, whose fields are:
 
@@ -342,10 +356,25 @@ def dump_graph(
 
     if strip_info_file is not None:
         if isinstance(strip_info_file, str):
-            strip_info_file = open(strip_info_file, "w")
-        strip_info = json.loads(_imperative_rt.get_info_for_strip(ov))
-        strip_info["hash"] = dump_info.content_hash
-        json.dump(strip_info, strip_info_file)
+            if not os.path.exists(strip_info_file):
+                os.mknod(strip_info_file)
+            strip_info_file = open(strip_info_file, "r+")
+        new_strip_dict = json.loads(_imperative_rt.get_info_for_strip(ov))
+        ori_strip_dict = new_strip_dict
+        json_content = strip_info_file.read()
+        if append_json and len(json_content) != 0:
+            # if there are contents in the json file,
Read them first and then append new information + ori_strip_dict = json.loads(json_content) + for k in ori_strip_dict: + new_strip_dict_v = new_strip_dict.get(k) + if new_strip_dict_v is not None: + for value in new_strip_dict_v: + if not value in ori_strip_dict[k]: + ori_strip_dict[k].append(value) + ori_strip_dict["hash"] = dump_info.content_hash + strip_info_file.seek(0) + strip_info_file.truncate() + json.dump(ori_strip_dict, strip_info_file) return dump_content, dump_info @@ -358,7 +387,7 @@ CompGraphLoadResult = collections.namedtuple( def load_graph(fpath): """Load a serialized computing graph from file. - :parma fpath: Path or Handle for the output file + :param fpath: Path or Handle of the input file :return: An instance of namedtuple :class:`CompGraphLoadResult`, whose fields are: diff --git a/imperative/python/megengine/core/tensor/multipledispatch/conflict.py b/imperative/python/megengine/core/tensor/multipledispatch/conflict.py index 6989755d..ec852aa7 100644 --- a/imperative/python/megengine/core/tensor/multipledispatch/conflict.py +++ b/imperative/python/megengine/core/tensor/multipledispatch/conflict.py @@ -40,6 +40,8 @@ # All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. # -------------------------------------------------------------------------------------- +from collections import OrderedDict + from .utils import _toposort, groupby from .variadic import isvariadic @@ -159,5 +161,5 @@ def ordering(signatures): for s in signatures: if s not in edges: edges[s] = [] - edges = dict((k, [b for a, b in v]) for k, v in edges.items()) + edges = OrderedDict((k, [b for a, b in v]) for k, v in edges.items()) return _toposort(edges) diff --git a/imperative/python/megengine/core/tensor/raw_tensor/__init__.py b/imperative/python/megengine/core/tensor/raw_tensor/__init__.py index a14fd197..ca62b105 100644 --- a/imperative/python/megengine/core/tensor/raw_tensor/__init__.py +++ b/imperative/python/megengine/core/tensor/raw_tensor/__init__.py @@ -100,6 +100,8 @@ def _(data: DeviceTensorND): @as_raw_tensor.register(np.ndarray) def _(array: np.ndarray, dtype=None, device=None): device = None if device is None else as_device(device).to_c() + if 0 in array.strides: + array = array.squeeze().reshape(array.shape) return RawTensor(put(array, dtype=dtype, device=device)) diff --git a/imperative/python/megengine/core/tensor/tensor_wrapper.py b/imperative/python/megengine/core/tensor/tensor_wrapper.py index 840dfcab..6c8a3277 100644 --- a/imperative/python/megengine/core/tensor/tensor_wrapper.py +++ b/imperative/python/megengine/core/tensor/tensor_wrapper.py @@ -57,7 +57,29 @@ def _transpose(data, axes): def _broadcast(inp, shape): + def valid_broadcast(src, tar): + def failed(): + raise ValueError( + "the input shape {} can not be broadcasted to target shape {}".format( + src, tar + ) + ) + + if isinstance(src, (TensorBase, TensorWrapperBase)): + src = src.numpy() + + if isinstance(tar, (TensorBase, TensorWrapperBase)): + tar = tar.numpy() + + if len(src) > len(tar): + failed() + + for i in range(min(len(src), len(tar))): + if src[-i - 1] != 1 and src[-i - 1] != tar[-i - 1]: + failed() + shape = utils.astensor1d(shape, inp, dtype="int32", device=inp.device) + valid_broadcast(inp.shape, shape) (result,) = apply(builtin.Broadcast(), inp, shape) return result @@ -158,6 +180,10 @@ def _reduce(mode): def f(self, axis=None, keepdims: bool = False): data = self (data,) = utils.convert_inputs(data) + if mode == "MEAN": + data = data.astype("float32") + elif self.dtype 
== np.bool_: + data = data.astype("int32") if axis is None: data = data.reshape(-1) assert not keepdims, "can not set axis=None and keepdims=True" @@ -180,6 +206,9 @@ def _reduce(mode): if not keepdims: result = _remove_axis(result, axis) + if self.dtype == np.bool_: + if mode in ["MIN", "MAX"]: + result = result.astype("bool") return result return f @@ -203,7 +232,8 @@ def _todo(*_): def _expand_args(args): if len(args) == 1: if isinstance( - args[0], (collections.abc.Sequence, TensorBase, TensorWrapperBase) + args[0], + (collections.abc.Sequence, TensorBase, TensorWrapperBase, np.ndarray), ): args = args[0] return args @@ -366,7 +396,8 @@ class ArrayMethodMixin(abc.ABC): def reshape(self, *args): return _reshape(self, _expand_args(args)) - def broadcast(self, *args): + # FIXME: remove this method + def _broadcast(self, *args): return _broadcast(self, _expand_args(args)) def transpose(self, *args): @@ -377,7 +408,38 @@ class ArrayMethodMixin(abc.ABC): def flatten(self): return self.reshape(-1) - sum = _reduce("SUM") + def sum(self, axis=None, keepdims: bool = False): + r"""Returns the sum of each row of the input tensor in the given dimension ``axis``. + If ``axis`` is a list of axises, reduce over all of them. + + If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor, except in the dimension(s) ``axis`` where it is of size 1. Otherwise, ``axis`` is squeezed(see :meth:`~.functional.tensor.squeeze`). + + Same for prod/mean/max/min. + + :param axis: the dimension or dimensions to reduce. + :param keepdim: whether the output tensor has ndim retained or not. + :return: output tensor. + + Examples: + + .. testcode:: + + from megengine import tensor + a = tensor([False, True, True, False]) + b = tensor([1.0, 2.0, 3.0, 4.0]) + print(a.sum().numpy()) + print(b.sum().numpy()) + + Outputs: + + .. testoutput:: + + [2] + [10.] + + """ + return _reduce("SUM")(self, axis, keepdims) + prod = _reduce("PRODUCT") min = _reduce("MIN") max = _reduce("MAX") diff --git a/imperative/python/megengine/core/tensor/utils.py b/imperative/python/megengine/core/tensor/utils.py index 9c795fbe..400e1523 100644 --- a/imperative/python/megengine/core/tensor/utils.py +++ b/imperative/python/megengine/core/tensor/utils.py @@ -16,39 +16,74 @@ from ..ops.special import Const from ..tensor.core import OpBase, TensorBase, TensorWrapperBase, apply -def dtype_promotion(raw_inputs): - def add_dtype(i): - if type(i) == int: - return np.array(i, dtype=np.int32) - if type(i) == float: - return np.array(i, dtype=np.float32) - if type(i) == bool: - return np.array(i, dtype=np.bool_) - return None - - scalar_inputs = [ - add_dtype(i) for i in raw_inputs if not hasattr(i, "dtype") and add_dtype(i) - ] - inputs = [i for i in raw_inputs if hasattr(i, "dtype")] - assert len(scalar_inputs + inputs) > 0 - dtype = None - if len(inputs) > 0: - dtype = np.result_type(*inputs) - dtype_all = np.result_type(*(inputs + scalar_inputs)) - assert ( - dtype != np.float64 and dtype != np.int64 - ), "unsupport dtype {} by dtype_promotion, please use explict type convert".format( - dtype - ) - if dtype_all == np.bool_: - for i in raw_inputs: - if not hasattr(i, "dtype") or i.dtype != np.bool_: - raise TypeError( - "bool dtype can not be operated with an element without bool dtype" - ) - if dtype_all == np.float64: - dtype_all = np.float32 - return dtype_all +def dtype_promotion(inputs): + """ + Returns the dtype that would result from performing an arithmetic + operation on the provided input tensors and scalars. 
+ """ + # map numpy.dtype.kind to priority + category_priority = { + "f": 3, # floating-point + "i": 2, # signed integer + "u": 2, # unsigned integer + "b": 1, # boolean + } + + def scalar2dtype(x): + """ + For scalar `x`, returns its corresponding type. A floating point scalar + has dtype 'float32'. An integral non-boolean scalar has dtype 'int32'. + A boolean scalar has dtype 'bool'. + """ + if isinstance(x, bool): + return np.bool_ + if isinstance(x, int): + return np.int32 + if isinstance(x, float): + return np.float32 + + def promote_types(types, cat): + """ + Returns the data type with sufficient size to hold all types of + category `cat` in the list `types`. + """ + used_types = [ + i for i in types if category_priority.get(np.dtype(i).kind, 0) == cat + ] + assert len(used_types) > 0 + res = used_types[0] + for i in used_types: + res = np.promote_types(res, i) + return res + + def max_priority(types): + """ + Returns the maximum value of the priority of each type in the list + `types`. + """ + if not types: + return 0 + else: + return max([category_priority.get(np.dtype(i).kind, 0) for i in types]) + + scalars = [] + tensors = [] + + for data in inputs: + if hasattr(data, "dtype"): + tensors.append(data.dtype) + elif isinstance(data, (float, int, bool)): + scalars.append(scalar2dtype(data)) + + max_pri_scalars = max_priority(scalars) + max_pri_tensors = max_priority(tensors) + + assert max_pri_scalars > 0 or max_pri_tensors > 0 + + if max_pri_scalars > max_pri_tensors: + return promote_types(scalars, max_pri_scalars) + else: + return promote_types(tensors, max_pri_tensors) def get_device(inputs): diff --git a/imperative/python/megengine/core/utils/__init__.py b/imperative/python/megengine/core/utils/__init__.py deleted file mode 100644 index 200f5ab2..00000000 --- a/imperative/python/megengine/core/utils/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-from .comp_graph_tools import * diff --git a/imperative/python/megengine/data/_queue.py b/imperative/python/megengine/data/_queue.py index a9e328c6..8e359ae0 100644 --- a/imperative/python/megengine/data/_queue.py +++ b/imperative/python/megengine/data/_queue.py @@ -26,7 +26,7 @@ def _clear_plasma_store(): # `_PlasmaStoreManager.__del__` will not be called automaticly in subprocess, # so this function should be called explicitly global MGE_PLASMA_STORE_MANAGER - if MGE_PLASMA_STORE_MANAGER is not None: + if MGE_PLASMA_STORE_MANAGER is not None and MGE_PLASMA_STORE_MANAGER.refcount == 0: del MGE_PLASMA_STORE_MANAGER MGE_PLASMA_STORE_MANAGER = None @@ -50,6 +50,7 @@ class _PlasmaStoreManager: stderr=None if debug_flag else subprocess.DEVNULL, ) self.__initialized = True + self.refcount = 1 def __del__(self): if self.__initialized and self.plasma_store.returncode is None: @@ -83,6 +84,8 @@ class PlasmaShmQueue: "Exception happened in starting plasma_store: {}\n" "Tips: {}".format(str(e), err_info) ) + else: + MGE_PLASMA_STORE_MANAGER.refcount += 1 self.socket_name = MGE_PLASMA_STORE_MANAGER.socket_name @@ -133,6 +136,8 @@ class PlasmaShmQueue: def close(self): self.queue.close() self.disconnect_client() + global MGE_PLASMA_STORE_MANAGER + MGE_PLASMA_STORE_MANAGER.refcount -= 1 _clear_plasma_store() def cancel_join_thread(self): diff --git a/imperative/python/megengine/data/collator.py b/imperative/python/megengine/data/collator.py index 952fc398..6242e767 100644 --- a/imperative/python/megengine/data/collator.py +++ b/imperative/python/megengine/data/collator.py @@ -34,14 +34,14 @@ default_collate_err_msg_format = ( class Collator: r""" - Used for merge a list of samples to form a mini-batch of Tenor(s). Used when using batched loading from a dataset. - modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py + Used for merging a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a dataset. + Modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py """ def apply(self, inputs): """ - input : sequence_N(tuple(CHW, C, CK)) - output : tuple(NCHW, NC, NCK) + :param input: sequence_N(tuple(CHW, C, CK)). + :return: tuple(NCHW, NC, NCK). """ elem = inputs[0] elem_type = type(elem) diff --git a/imperative/python/megengine/data/dataloader.py b/imperative/python/megengine/data/dataloader.py index 1fd3482d..a92dff7a 100644 --- a/imperative/python/megengine/data/dataloader.py +++ b/imperative/python/megengine/data/dataloader.py @@ -43,7 +43,7 @@ class DataLoader: ): r"""Provides a convenient way to iterate on a given dataset. - `DataLoader` combines a dataset with sampler, transform and collator, + `DataLoader` combines a dataset with `sampler`, `transform` and `collator`, make it flexible to get minibatch continually from a dataset. :type dataset: Dataset @@ -53,21 +53,21 @@ class DataLoader: If specified, :attr:`shuffle` must be ``False``. :type transform: Transform :param transform: defined the transforming strategy for a sampled batch. - (default: ``None``) + Default: None :type collator: Collator :param collator: defined the merging strategy for a transformed batch. - (default: ``None``) + Default: None :type num_workers: int :param num_workers: the number of sub-process to load, transform and collate - the batch. ``0`` means using single-process. (default: ``0``) + the batch. ``0`` means using single-process. 
Default: 0
     :type timeout: int
     :param timeout: if positive, means the timeout value(second) for collecting a
-        batch from workers. (default: 0)
+        batch from workers. Default: 0
     :type divide: bool
     :param divide: define the paralleling strategy in multi-processing mode.
         ``True`` means one batch is divided into :attr:`num_workers` pieces, and
         the workers will process these pieces parallelly. ``False`` means
-        different sub-process will process different batch. (default: ``False``)
+        different sub-processes will process different batches. Default: False
 
     """
diff --git a/imperative/python/megengine/data/dataset/meta_dataset.py b/imperative/python/megengine/data/dataset/meta_dataset.py
index 4415a427..8b2a304d 100644
--- a/imperative/python/megengine/data/dataset/meta_dataset.py
+++ b/imperative/python/megengine/data/dataset/meta_dataset.py
@@ -12,7 +12,7 @@ from typing import Tuple
 
 class Dataset(ABC):
     r"""
-    An abstract class for all Datasets
+    An abstract class for all Datasets.
     """
 
     @abstractmethod
@@ -22,8 +22,8 @@ class Dataset(ABC):
 
 class MapDataset(Dataset):
     r"""
-    An abstract class for map data
-    __getitem__ and __len__ method are aditionally needed
+    An abstract class for map data.
+    __getitem__ and __len__ methods are additionally needed.
     """
 
     @abstractmethod
@@ -41,8 +41,8 @@ class MapDataset(Dataset):
 
 class StreamDataset(Dataset):
     r"""
-    An abstract class for stream data
-    __iter__ method is aditionally needed
+    An abstract class for stream data.
+    __iter__ method is additionally needed.
     """
 
     @abstractmethod
diff --git a/imperative/python/megengine/data/dataset/vision/cifar.py b/imperative/python/megengine/data/dataset/vision/cifar.py
index 9ce73688..e969921d 100644
--- a/imperative/python/megengine/data/dataset/vision/cifar.py
+++ b/imperative/python/megengine/data/dataset/vision/cifar.py
@@ -21,7 +21,7 @@ logger = get_logger(__name__)
 
 class CIFAR10(VisionDataset):
-    r""" ``Dataset`` for CIFAR10 meta data
+    r""" ``Dataset`` for CIFAR10 meta data.
     """
 
     url_path = "http://www.cs.utoronto.ca/~kriz/"
diff --git a/imperative/python/megengine/data/dataset/vision/coco.py b/imperative/python/megengine/data/dataset/vision/coco.py
index d247e52b..11366de0 100644
--- a/imperative/python/megengine/data/dataset/vision/coco.py
+++ b/imperative/python/megengine/data/dataset/vision/coco.py
@@ -118,7 +118,7 @@ class COCO(VisionDataset):
         self.ids = ids
 
         self.json_category_id_to_contiguous_id = {
-            v: i + 1 for i, v in enumerate(self.cats.keys())
+            v: i + 1 for i, v in enumerate(sorted(self.cats.keys()))
         }
 
         self.contiguous_category_id_to_json_id = {
diff --git a/imperative/python/megengine/data/dataset/vision/folder.py b/imperative/python/megengine/data/dataset/vision/folder.py
index 7124ef56..e16c32e5 100644
--- a/imperative/python/megengine/data/dataset/vision/folder.py
+++ b/imperative/python/megengine/data/dataset/vision/folder.py
@@ -30,19 +30,18 @@ class ImageFolder(VisionDataset):
         r"""
         ImageFolder is a class for loading image data and labels from a organized folder.
 
-        the folder is expected to be organized as followed
-        root/cls/xxx.img_ext
+        The folder is expected to be organized as follows: root/cls/xxx.img_ext
 
-        labels are indices of sorted classes in the root directory
+        Labels are indices of sorted classes in the root directory.
 
-        :param root: root directory of an image folder
+        :param root: root directory of an image folder.
         :param loader: a function used to load image from path, if ``None``, default function that loads
-            images with PILwill be called
+            images with PIL will be called.
:param check_valid_func: a function used to check if files in folder are expected image files, if ``None``, default function - that checks file extensions will be called - :param class_name: if ``True``, return class name instead of class index + that checks file extensions will be called. + :param class_name: if ``True``, return class name instead of class index. """ super().__init__(root, order=("image", "image_category")) diff --git a/imperative/python/megengine/data/dataset/vision/imagenet.py b/imperative/python/megengine/data/dataset/vision/imagenet.py index 94c2396c..e84dcddf 100644 --- a/imperative/python/megengine/data/dataset/vision/imagenet.py +++ b/imperative/python/megengine/data/dataset/vision/imagenet.py @@ -31,7 +31,7 @@ logger = get_logger(__name__) class ImageNet(ImageFolder): r""" - Load ImageNet from raw files or folder, expected folder looks like + Load ImageNet from raw files or folder. Expected folder looks like: .. code-block:: bash @@ -60,25 +60,25 @@ class ImageNet(ImageFolder): def __init__(self, root: str = None, train: bool = True, **kwargs): r""" - initialization: + Initialization: - * if ``root`` contains ``self.target_folder`` depent on ``train``: + * if ``root`` contains ``self.target_folder`` depending on ``train``: - * initialize ImageFolder with target_folder + * initialize ImageFolder with target_folder. * else: * if all raw files are in ``root``: - * parse ``self.target_folder`` from raw files - * initialize ImageFolder with ``self.target_folder`` + * parse ``self.target_folder`` from raw files. + * initialize ImageFolder with ``self.target_folder``. * else: - * raise error + * raise error. - :param root: root directory of imagenet data, if root is ``None``, used default_dataset_root - :param train: if ``True``, load the train split, otherwise load the validation split + :param root: root directory of imagenet data, if root is ``None``, use default_dataset_root. + :param train: if ``True``, load the train split, otherwise load the validation split. """ # process the root path diff --git a/imperative/python/megengine/data/dataset/vision/mnist.py b/imperative/python/megengine/data/dataset/vision/mnist.py index 5e89a314..665fb693 100644 --- a/imperative/python/megengine/data/dataset/vision/mnist.py +++ b/imperative/python/megengine/data/dataset/vision/mnist.py @@ -22,12 +22,12 @@ logger = get_logger(__name__) class MNIST(VisionDataset): - r""" ``Dataset`` for MNIST meta data + r""" ``Dataset`` for MNIST meta data. """ url_path = "http://yann.lecun.com/exdb/mnist/" """ - url prefix for downloading raw file + Url prefix for downloading raw file. """ raw_file_name = [ "train-images-idx3-ubyte.gz", @@ -36,7 +36,7 @@ class MNIST(VisionDataset): "t10k-labels-idx1-ubyte.gz", ] """ - raw file names of both training set and test set (10k) + Raw file names of both training set and test set (10k). """ raw_file_md5 = [ "f68b3c2dcbeaaa9fbdd348bbdeb94873", @@ -45,7 +45,7 @@ class MNIST(VisionDataset): "ec29112dd5afa0611ce80d1b7f02629c", ] """ - md5 for checking raw files + Md5 for checking raw files. """ def __init__( @@ -57,10 +57,10 @@ class MNIST(VisionDataset): ): r""" :param root: path for mnist dataset downloading or loading, if ``None``, - set ``root`` to the ``_default_root`` - :param train: if ``True``, loading trainingset, else loading test set + set ``root`` to the ``_default_root``. + :param train: if ``True``, loading trainingset, else loading test set. 
:param download: if raw files do not exists and download sets to ``True``, - download raw files and process, otherwise raise ValueError, default is True + download raw files and process, otherwise raise ValueError, default is True. """ super().__init__(root, order=("image", "image_category")) diff --git a/imperative/python/megengine/data/dataset/vision/objects365.py b/imperative/python/megengine/data/dataset/vision/objects365.py index 7c1481ba..e56e6462 100644 --- a/imperative/python/megengine/data/dataset/vision/objects365.py +++ b/imperative/python/megengine/data/dataset/vision/objects365.py @@ -81,7 +81,7 @@ class Objects365(VisionDataset): self.ids = ids self.json_category_id_to_contiguous_id = { - v: i + 1 for i, v in enumerate(self.cats.keys()) + v: i + 1 for i, v in enumerate(sorted(self.cats.keys())) } self.contiguous_category_id_to_json_id = { diff --git a/imperative/python/megengine/data/dataset/vision/voc.py b/imperative/python/megengine/data/dataset/vision/voc.py index 42bf712d..b22fd2fa 100644 --- a/imperative/python/megengine/data/dataset/vision/voc.py +++ b/imperative/python/megengine/data/dataset/vision/voc.py @@ -75,6 +75,8 @@ class PascalVOC(VisionDataset): else: raise NotImplementedError + self.img_infos = dict() + def __getitem__(self, index): target = [] for k in self.order: @@ -107,9 +109,8 @@ class PascalVOC(VisionDataset): mask = mask[:, :, np.newaxis] target.append(mask) elif k == "info": - if image is None: - image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) - info = [image.shape[0], image.shape[1], self.file_names[index]] + info = self.get_img_info(index, image) + info = [info["height"], info["width"], info["file_name"]] target.append(info) else: raise NotImplementedError @@ -119,6 +120,17 @@ class PascalVOC(VisionDataset): def __len__(self): return len(self.images) + def get_img_info(self, index, image=None): + if index not in self.img_infos: + if image is None: + image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) + self.img_infos[index] = dict( + height=image.shape[0], + width=image.shape[1], + file_name=self.file_names[index], + ) + return self.img_infos[index] + def _trans_mask(self, mask): label = np.ones(mask.shape[:2]) * 255 for i in range(len(self.class_colors)): @@ -171,25 +183,3 @@ class PascalVOC(VisionDataset): "train", "tvmonitor", ) - class_colors = [ - [0, 0, 128], - [0, 128, 0], - [0, 128, 128], - [128, 0, 0], - [128, 0, 128], - [128, 128, 0], - [128, 128, 128], - [0, 0, 64], - [0, 0, 192], - [0, 128, 64], - [0, 128, 192], - [128, 0, 64], - [128, 0, 192], - [128, 128, 64], - [128, 128, 192], - [0, 64, 0], - [0, 64, 128], - [0, 192, 0], - [0, 192, 128], - [128, 64, 0], - ] diff --git a/imperative/python/megengine/data/sampler.py b/imperative/python/megengine/data/sampler.py index dbd5d3a3..3a748ae7 100644 --- a/imperative/python/megengine/data/sampler.py +++ b/imperative/python/megengine/data/sampler.py @@ -28,25 +28,25 @@ class Sampler(ABC): seed=None, ): r""" - An abstract class for all sampler + An abstract class for all sampler. :type dataset: `dataset` - :param dataset: dataset to sample from + :param dataset: dataset to sample from. :type batch_size: positive integer - :param batch_size: batch size for batch method + :param batch_size: batch size for batch method. :type drop_last: bool :param drop_last: set ``True`` to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If ``False`` and the size of dataset is not divisible by the batch_size, then the last batch will - be smaller. 
-        (default: ``False``)
+        be smaller. Default: False
    :type num_samples: positive integer
-    :param num_samples: number of samples assigned to one rank
+    :param num_samples: number of samples assigned to one rank.
    :type world_size: positive integer
-    :param world_size: number of ranks
+    :param world_size: number of ranks.
    :type rank: non-negative integer within 0 and world_size
-    :param rank: rank id, non-negative interger within 0 and ``world_size``
+    :param rank: rank id, non-negative integer within 0 and ``world_size``.
    :type seed: non-negative integer
-    :param seed: seed for random operators
+    :param seed: seed for random operators.
    """
        if (
            not isinstance(batch_size, int)
@@ -103,15 +103,15 @@ class Sampler(ABC):

    def sample(self):
        """
-        return a list contains all sample indices
+        Return a list containing all sample indices.
        """
        raise NotImplementedError

    def scatter(self, indices) -> List:
        r"""
-        scatter method is used for splitting indices into subset, each subset
+        Scatter method is used for splitting indices into subsets; each subset
        will be assigned to a rank. Indices are evenly splitted by default.
-        If customized indices assignment method is needed, please rewrite this method
+        If a customized indices assignment method is needed, please rewrite this method.
        """
        total_size = self.num_samples * self.world_size
@@ -127,7 +127,7 @@ class Sampler(ABC):

    def batch(self) -> Iterator[List[Any]]:
        r"""
-        batch method provides a batch indices generator
+        Batch method provides a batch indices generator.
        """
        indices = list(self.sample())
@@ -156,7 +156,7 @@ class SequentialSampler(Sampler):
        rank=None,
    ):
        r"""
-        Sample elements sequentially
+        Sample elements sequentially.
        """
        super().__init__(dataset, batch_size, drop_last, None, world_size, rank)
        if indices is not None and not isinstance(indices, collections.abc.Sequence):
@@ -168,7 +168,7 @@

    def sample(self) -> Iterator[Any]:
        r"""
-        return a generator
+        Return a generator.
        """
        if self.indices is None:
            return iter(range(len(self.dataset)))
@@ -188,7 +188,7 @@ class RandomSampler(Sampler):
        seed=None,
    ):
        r"""
-        Sample elements randomly without replacement
+        Sample elements randomly without replacement.
        """
        super().__init__(dataset, batch_size, drop_last, None, world_size, rank, seed)
        if indices is not None and not isinstance(indices, collections.abc.Sequence):
@@ -218,10 +218,10 @@ class ReplacementSampler(Sampler):
        seed=None,
    ):
        r"""
-        Sample elements randomly with replacement
+        Sample elements randomly with replacement.

        :type weights: List
-        :param weights: weights for sampling indices, it could be unnormalized weights
+        :param weights: weights for sampling indices, it could be unnormalized weights.
        """
        super().__init__(
            dataset, batch_size, drop_last, num_samples, world_size, rank, seed
        )
@@ -250,7 +250,7 @@

class Infinite(Sampler):
-    r"""Infinite Sampler warper for basic sampler"""
+    r"""Infinite Sampler wrapper for basic sampler."""

    def sample(self):
        raise NotImplementedError("sample method not supported in Infinite")
diff --git a/imperative/python/megengine/data/transform/meta_transform.py b/imperative/python/megengine/data/transform/meta_transform.py
index d7fd4f47..18951aaf 100644
--- a/imperative/python/megengine/data/transform/meta_transform.py
+++ b/imperative/python/megengine/data/transform/meta_transform.py
@@ -12,7 +12,7 @@ from typing import Sequence, Tuple

class Transform(ABC):
    """
-    rewrite apply method in subclass
+    Rewrite the apply method in subclass.
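An illustrative aside, not part of the patch: a minimal subclass honoring this contract, assuming ``apply`` is the abstract per-sample hook that ``apply_batch`` fans out to (hypothetical class name)::

    from megengine.data.transform.meta_transform import Transform

    class Identity(Transform):
        # no-op transform: return each sample unchanged
        def apply(self, input):
            return input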
""" def apply_batch(self, inputs: Sequence[Tuple]): diff --git a/imperative/python/megengine/data/transform/vision/functional.py b/imperative/python/megengine/data/transform/vision/functional.py index e2f4e512..d5b9ad51 100644 --- a/imperative/python/megengine/data/transform/vision/functional.py +++ b/imperative/python/megengine/data/transform/vision/functional.py @@ -15,7 +15,7 @@ import numpy as np def wrap_keepdims(func): - """Wraper to keep the dimension of input images unchanged""" + """Wraper to keep the dimension of input images unchanged.""" @functools.wraps(func) def wrapper(image, *args, **kwargs): @@ -34,10 +34,10 @@ def wrap_keepdims(func): @wrap_keepdims def to_gray(image): r""" - Change BGR format image's color space to gray + Change BGR format image's color space to gray. - :param image: Input BGR format image, with (H, W, C) shape - :return: Gray format image, with (H, W, C) shape + :param image: input BGR format image, with `(H, W, C)` shape. + :return: gray format image, with `(H, W, C)` shape. """ return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) @@ -45,10 +45,10 @@ def to_gray(image): @wrap_keepdims def to_bgr(image): r""" - Change gray format image's color space to BGR + Change gray format image's color space to BGR. - :param image: input Gray format image, with (H, W, C) shape - :return: BGR format image, with (H, W, C) shape + :param image: input Gray format image, with `(H, W, C)` shape. + :return: BGR format image, with `(H, W, C)` shape. """ return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) @@ -56,18 +56,18 @@ def to_bgr(image): @wrap_keepdims def pad(input, size, value): r""" - Pad input data with *value* and given *size* + Pad input data with *value* and given *size*. - :param input: Input data, with (H, W, C) shape - :param size: Padding size of input data, it could be integer or sequence. - If it's an integer, the input data will be padded in four directions. - If it's a sequence contains two integer, the bottom and right side + :param input: input data, with `(H, W, C)` shape. + :param size: padding size of input data, it could be integer or sequence. + If it is an integer, the input data will be padded in four directions. + If it is a sequence contains two integer, the bottom and right side of input data will be padded. - If it's a sequence contains four integer, the top, bottom, left, right + If it is a sequence contains four integer, the top, bottom, left, right side of input data will be padded with given size. - :param value: Padding value of data, could be a sequence of int or float. - if it's float value, the dtype of image will be casted to float32 also. - :return: Padded image + :param value: padding value of data, could be a sequence of int or float. + If it is float value, the dtype of image will be casted to float32 also. + :return: padded image. """ if isinstance(size, int): size = (size, size, size, size) @@ -81,14 +81,18 @@ def pad(input, size, value): @wrap_keepdims def flip(image, flipCode): r""" - Accordding to the flipCode (the type of flip), flip the input image + Accordding to the flipCode (the type of flip), flip the input image. - :param image: Input image, with (H, W, C) shape + :param image: input image, with `(H, W, C)` shape. :param flipCode: code that indicates the type of flip. 
- 1 : Flip horizontally - 0 : Flip vertically - -1 : Flip horizontally and vertically - :return: BGR format image, with (H, W, C) shape + + * 1 : Flip horizontally + + * 0 : Flip vertically + + * -1: Flip horizontally and vertically + + :return: BGR format image, with `(H, W, C)` shape. """ return cv2.flip(image, flipCode=flipCode) @@ -96,12 +100,12 @@ def flip(image, flipCode): @wrap_keepdims def resize(input, size, interpolation=cv2.INTER_LINEAR): r""" - resize the input data to given size + Resize the input data to given size. - :param input: Input data, could be image or masks, with (H, W, C) shape - :param size: Target size of input data, with (height, width) shape. - :param interpolation: Interpolation method. - :return: Resized data, with (H, W, C) shape + :param input: input data, could be image or masks, with `(H, W, C)` shape. + :param size: target size of input data, with (height, width) shape. + :param interpolation: interpolation method. + :return: resized data, with `(H, W, C)` shape. """ if len(size) != 2: raise ValueError("resize needs (h, w), but got {}".format(size)) diff --git a/imperative/python/megengine/data/transform/vision/transform.py b/imperative/python/megengine/data/transform/vision/transform.py index ff989ed6..4a605cb8 100644 --- a/imperative/python/megengine/data/transform/vision/transform.py +++ b/imperative/python/megengine/data/transform/vision/transform.py @@ -44,26 +44,26 @@ __all__ = [ class VisionTransform(Transform): r""" Base class of all transforms used in computer vision. - calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*() + Calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*() method. If you want to implement a self-defined transform method for image, rewrite _apply_image method in subclass. - :param order: Input type order. Input is a tuple contains different structures, + :param order: input type order. Input is a tuple containing different structures, order is used to specify the order of structures. For example, if your input - is (image, boxes) type, then the order should be ("image", "boxes"). - Current available strings & data type are describe below: + is (image, boxes) type, then the ``order`` should be ("image", "boxes"). + Current available strings and data type are describe below: - * "image": input image, with shape of (H, W, C) - * "coords": coordinates, with shape of (N, 2) - * "boxes": bounding boxes, with shape of (N, 4), "xyxy" format, + * "image": input image, with shape of `(H, W, C)`. + * "coords": coordinates, with shape of `(N, 2)`. + * "boxes": bounding boxes, with shape of `(N, 4)`, "xyxy" format, the 1st "xy" represents top left point of a box, the 2nd "xy" represents right bottom point. - * "mask": map used for segmentation, with shape of (H, W, 1) - * "keypoints": keypoints with shape of (N, K, 3), N for number of instances, + * "mask": map used for segmentation, with shape of `(H, W, 1)`. + * "keypoints": keypoints with shape of `(N, K, 3)`, N for number of instances, and K for number of keypoints in one instance. The first two dimensions of last axis is coordinate of keypoints and the the 3rd dimension is the label of keypoints. - * "polygons": A sequence contains numpy array, its length is number of instances. + * "polygons": a sequence containing numpy arrays, its length is the number of instances. Each numpy array represents polygon coordinate of one instance. * "category": categories for some data type. 
For example, "image_category" means category of the input image and "boxes_category" means categories of @@ -94,11 +94,11 @@ class VisionTransform(Transform): self.order = order def apply_batch(self, inputs: Sequence[Tuple]): - r"""Apply transform on batch input data""" + r"""Apply transform on batch input data.""" return tuple(self.apply(input) for input in inputs) def apply(self, input: Tuple): - r"""Apply transform on single input data""" + r"""Apply transform on single input data.""" if not isinstance(input, tuple): input = (input,) @@ -156,10 +156,10 @@ class VisionTransform(Transform): class ToMode(VisionTransform): r"""Change input data to a target mode. For example, most transforms use HWC mode image, - while the Neural Network might use CHW mode input tensor + while the neural network might use CHW mode input tensor. - :param mode: Output mode of input. Use "CHW" mode by default. - :param order: The same with :class:`VisionTransform` + :param mode: output mode of input. Default: "CHW" + :param order: the same with :class:`VisionTransform` """ def __init__(self, mode="CHW", *, order=None): @@ -185,14 +185,14 @@ class Compose(VisionTransform): r""" Composes several transforms together. - :param transforms: List of :class:`VisionTransform` to compose. - :param batch_compose: Whether use shuffle_indices for batch data or not. + :param transforms: list of :class:`VisionTransform` to compose. + :param batch_compose: whether use shuffle_indices for batch data or not. If True, use original input sequence. Otherwise, the shuffle_indices will be used for transforms. - :param shuffle_indices: Indices used for random shuffle, start at 1. + :param shuffle_indices: indices used for random shuffle, start at 1. For example, if shuffle_indices is [(1, 3), (2, 4)], then the 1st and 3rd transform will be random shuffled, the 2nd and 4th transform will also be shuffled. - :param order: The same with :class:`VisionTransform` + :param order: the same with :class:`VisionTransform` Examples: @@ -264,8 +264,8 @@ class TorchTransformCompose(VisionTransform): some transforms with tensor in torchvision are not supported, such as Normalize and ToTensor in torchvision. - :param transforms: The same with ``Compose`` - :param order: The same with :class:`VisionTransform` + :param transforms: the same with ``Compose``. + :param order: the same with :class:`VisionTransform`. """ def __init__(self, transforms, *, order=None): @@ -303,16 +303,16 @@ class TorchTransformCompose(VisionTransform): class Pad(VisionTransform): r"""Pad the input data. - :param size: Padding size of input image, it could be integer or sequence. - If it's an integer, the input image will be padded in four directions. - If it's a sequence contains two integer, the bottom and right side + :param size: padding size of input image, it could be integer or sequence. + If it is an integer, the input image will be padded in four directions. + If it is a sequence containing two integers, the bottom and right side of image will be padded. - If it's a sequence contains four integer, the top, bottom, left, right + If it is a sequence containing four integers, the top, bottom, left, right side of image will be padded with given size. - :param value: Padding value of image, could be a sequence of int or float. - if it's float value, the dtype of image will be casted to float32 also. - :param mask_value: Padding value of segmentation map. 
- :param order: The same with :class:`VisionTransform` + :param value: padding value of image, could be a sequence of int or float. + if it is float value, the dtype of image will be casted to float32 also. + :param mask_value: padding value of segmentation map. + :param order: the same with :class:`VisionTransform`. """ def __init__(self, size=0, value=0, mask_value=0, *, order=None): @@ -350,15 +350,15 @@ class Pad(VisionTransform): class Resize(VisionTransform): r"""Resize the input data. - :param output_size: Target size of image, with (height, width) shape. - :param interpolation: Interpolation method. All methods are listed below: + :param output_size: target size of image, with (height, width) shape. + :param interpolation: interpolation method. All methods are listed below: * cv2.INTER_NEAREST – a nearest-neighbor interpolation. * cv2.INTER_LINEAR – a bilinear interpolation (used by default). * cv2.INTER_AREA – resampling using pixel area relation. * cv2.INTER_CUBIC – a bicubic interpolation over 4×4 pixel neighborhood. * cv2.INTER_LANCZOS4 – a Lanczos interpolation over 8×8 pixel neighborhood. - :param order: The same with :class:`VisionTransform` + :param order: the same with :class:`VisionTransform`. """ def __init__(self, output_size, interpolation=cv2.INTER_LINEAR, *, order=None): @@ -476,8 +476,8 @@ class ShortestEdgeResize(VisionTransform): class RandomResize(VisionTransform): r"""Resize the input data randomly. - :param scale_range: . - :param order: The same with :class:`VisionTransform` + :param scale_range: range of scaling. + :param order: the same with :class:`VisionTransform`. """ def __init__(self, scale_range, interpolation=cv2.INTER_LINEAR, *, order=None): @@ -519,13 +519,13 @@ class RandomResize(VisionTransform): class RandomCrop(VisionTransform): r"""Crop the input data randomly. Before applying the crop transform, - pad the image first. And if target size is still bigger than the size of + pad the image first. If target size is still bigger than the size of padded image, pad the image size to target size. - :param output_size: Target size of output image, with (height, width) shape. - :param padding_size: The same with `size` in ``Pad`` - :param padding_value: The same with `value` in ``Pad`` - :param order: The same with :class:`VisionTransform` + :param output_size: target size of output image, with (height, width) shape. + :param padding_size: the same with `size` in ``Pad``. + :param padding_value: the same with `value` in ``Pad``. + :param order: the same with :class:`VisionTransform`. """ def __init__( @@ -580,10 +580,10 @@ class RandomResizedCrop(VisionTransform): aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. After applying crop transfrom, the input data will be resized to given size. - :param output_size: Target size of output image, with (height, width) shape. - :param scale_range: Range of size of the origin size cropped. Default: (0.08, 1.0) - :param ratio_range: Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) - :param order: The same with :class:`VisionTransform` + :param output_size: target size of output image, with (height, width) shape. + :param scale_range: range of size of the origin size cropped. Default: (0.08, 1.0) + :param ratio_range: range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) + :param order: the same with :class:`VisionTransform`. 
""" def __init__( @@ -666,8 +666,8 @@ class RandomResizedCrop(VisionTransform): class CenterCrop(VisionTransform): r"""Crops the given the input data at the center. - :param output_size: Target size of output image, with (height, width) shape. - :param order: The same with :class:`VisionTransform` + :param output_size: target size of output image, with (height, width) shape. + :param order: the same with :class:`VisionTransform`. """ def __init__(self, output_size, *, order=None): @@ -710,7 +710,7 @@ class RandomHorizontalFlip(VisionTransform): r"""Horizontally flip the input data randomly with a given probability. :param p: probability of the input data being flipped. Default: 0.5 - :param order: The same with :class:`VisionTransform` + :param order: the same with :class:`VisionTransform`. """ def __init__(self, prob: float = 0.5, *, order=None): @@ -742,7 +742,7 @@ class RandomVerticalFlip(VisionTransform): r"""Vertically flip the input data randomly with a given probability. :param p: probability of the input data being flipped. Default: 0.5 - :param order: The same with :class:`VisionTransform` + :param order: the same with :class:`VisionTransform`. """ def __init__(self, prob: float = 0.5, *, order=None): @@ -776,9 +776,9 @@ class Normalize(VisionTransform): this transform will normalize each channel of the input data. ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` - :param mean: Sequence of means for each channel. - :param std: Sequence of standard deviations for each channel. - :param order: The same with :class:`VisionTransform` + :param mean: sequence of means for each channel. + :param std: sequence of standard deviations for each channel. + :param order: the same with :class:`VisionTransform`. """ def __init__(self, mean=0.0, std=1.0, *, order=None): @@ -802,7 +802,7 @@ class GaussianNoise(VisionTransform): :param mean: Gaussian mean used to generate noise. :param std: Gaussian standard deviation used to generate noise. - :param order: The same with :class:`VisionTransform` + :param order: the same with :class:`VisionTransform` """ def __init__(self, mean=0.0, std=1.0, *, order=None): @@ -826,9 +826,9 @@ class GaussianNoise(VisionTransform): class BrightnessTransform(VisionTransform): r"""Adjust brightness of the input data. - :param value: How much to adjust the brightness. Can be any - non negative number. 0 gives the original image - :param order: The same with :class:`VisionTransform` + :param value: how much to adjust the brightness. Can be any + non negative number. 0 gives the original image. + :param order: the same with :class:`VisionTransform`. """ def __init__(self, value, *, order=None): @@ -857,9 +857,9 @@ class BrightnessTransform(VisionTransform): class ContrastTransform(VisionTransform): r"""Adjust contrast of the input data. - :param value: How much to adjust the contrast. Can be any - non negative number. 0 gives the original image - :param order: The same with :class:`VisionTransform` + :param value: how much to adjust the contrast. Can be any + non negative number. 0 gives the original image. + :param order: the same with :class:`VisionTransform`. """ def __init__(self, value, *, order=None): @@ -888,9 +888,9 @@ class ContrastTransform(VisionTransform): class SaturationTransform(VisionTransform): r"""Adjust saturation of the input data. - :param value: How much to adjust the saturation. Can be any - non negative number. 
0 gives the original image - :param order: The same with :class:`VisionTransform` + :param value: how much to adjust the saturation. Can be any + non negative number. 0 gives the original image. + :param order: the same with :class:`VisionTransform`. """ def __init__(self, value, *, order=None): @@ -919,9 +919,9 @@ class SaturationTransform(VisionTransform): class HueTransform(VisionTransform): r"""Adjust hue of the input data. - :param value: How much to adjust the hue. Can be any number - between 0 and 0.5, 0 gives the original image - :param order: The same with :class:`VisionTransform` + :param value: how much to adjust the hue. Can be any number + between 0 and 0.5, 0 gives the original image. + :param order: the same with :class:`VisionTransform`. """ def __init__(self, value, *, order=None): @@ -957,19 +957,19 @@ class HueTransform(VisionTransform): class ColorJitter(VisionTransform): r"""Randomly change the brightness, contrast, saturation and hue of an image. - :param brightness: How much to jitter brightness. + :param brightness: how much to jitter brightness. Chosen uniformly from [max(0, 1 - brightness), 1 + brightness] or the given [min, max]. Should be non negative numbers. - :param contrast: How much to jitter contrast. + :param contrast: how much to jitter contrast. Chosen uniformly from [max(0, 1 - contrast), 1 + contrast] or the given [min, max]. Should be non negative numbers. - :param saturation: How much to jitter saturation. + :param saturation: how much to jitter saturation. Chosen uniformly from [max(0, 1 - saturation), 1 + saturation] or the given [min, max]. Should be non negative numbers. - :param hue: How much to jitter hue. + :param hue: how much to jitter hue. Chosen uniformly from [-hue, hue] or the given [min, max]. Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. - :param order: The same with :class:`VisionTransform` + :param order: the same with :class:`VisionTransform`. """ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, *, order=None): diff --git a/imperative/python/megengine/device.py b/imperative/python/megengine/device.py index 95538320..2667536f 100644 --- a/imperative/python/megengine/device.py +++ b/imperative/python/megengine/device.py @@ -7,6 +7,7 @@ # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. import os +import re from .core._imperative_rt.common import CompNode, DeviceType from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config @@ -22,10 +23,8 @@ __all__ = [ def _valid_device(inp): - if isinstance(inp, str) and len(inp) == 4: - if inp[0] in {"x", "c", "g"} and inp[1:3] == "pu": - if inp[3] == "x" or inp[3].isdigit(): - return True + if isinstance(inp, str) and re.match("^[cxg]pu(\d+|\d+:\d+|x)$", inp): + return True return False @@ -71,11 +70,11 @@ def set_default_device(device: str = "xpux"): 'multithread' device type is avaliable when inference, which implements multi-threading parallelism at the operator level. For example, - 'multithread4' will compute with 4 threads. which implements + 'multithread4' will compute with 4 threads. The default value is 'xpux' to specify any device available. The priority of using gpu is higher when both gpu and cpu are available. - It can also be set by environmental variable `MGE_DEFAULT_DEVICE`. + It can also be set by environment variable `MGE_DEFAULT_DEVICE`. 
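As an editor's sketch (not part of the patch itself), the device-name pattern accepted by the new ``_valid_device`` can be exercised standalone; note the pattern would ideally be written as a raw string, to avoid invalid-escape warnings on the ``\d`` sequences::

    import re

    # pattern from the hunk above, as a raw string
    pattern = r"^[cxg]pu(\d+|\d+:\d+|x)$"

    for name in ["cpu0", "gpu1", "xpux", "gpu0:1", "cpu", "tpu0"]:
        print(name, bool(re.match(pattern, name)))
    # cpu0, gpu1, xpux and gpu0:1 match; bare "cpu" and "tpu0" do not.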
""" assert _valid_device(device), "Invalid device name {}".format(device) CompNode._set_default_device(device) @@ -99,13 +98,13 @@ def set_prealloc_config( growth_factor=2.0, device_type=DeviceType.CUDA, ): - """specifies how to pre-allocate from raw dev allocator + """Specifies how to pre-allocate from raw device allocator. :param alignment: specifies the alignment in bytes. :param min_req: min request size in bytes. :param max_overhead: max overhead above required size in bytes. - :growth_factor: request size / cur allocated - :device_type: the device type + :param growth_factor: `request size / cur allocated` + :param device_type: the device type """ assert alignment > 0 diff --git a/imperative/python/megengine/distributed/functional.py b/imperative/python/megengine/distributed/functional.py index 2bb01ccb..35210fd7 100644 --- a/imperative/python/megengine/distributed/functional.py +++ b/imperative/python/megengine/distributed/functional.py @@ -102,7 +102,7 @@ def _(op: RemoteRecv): def collective_comm(inp, mode, group, device): - """Helper function for applying collective communication functions""" + """Helper function for applying collective communication functions.""" assert isinstance(group, Group) if group is None: return inp @@ -123,11 +123,11 @@ def collective_comm(inp, mode, group, device): def reduce_sum( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create reduce_sum operator for collective communication + """Create reduce_sum operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. """ mode = CollectiveCommMode.REDUCE_SUM return collective_comm(inp, mode, group, device) @@ -136,11 +136,11 @@ def reduce_sum( def broadcast( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create broadcast operator for collective communication + """Create broadcast operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. """ mode = CollectiveCommMode.BROADCAST return collective_comm(inp, mode, group, device) @@ -149,11 +149,11 @@ def broadcast( def all_gather( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create all_gather operator for collective communication + """Create all_gather operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. """ mode = CollectiveCommMode.ALL_GATHER return collective_comm(inp, mode, group, device) @@ -162,11 +162,11 @@ def all_gather( def reduce_scatter_sum( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create reduce_scatter_sum operator for collective communication + """Create reduce_scatter_sum operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. 
""" mode = CollectiveCommMode.REDUCE_SCATTER_SUM return collective_comm(inp, mode, group, device) @@ -175,11 +175,11 @@ def reduce_scatter_sum( def all_reduce_sum( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create all_reduce_sum operator for collective communication + """Create all_reduce_sum operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. """ mode = CollectiveCommMode.ALL_REDUCE_SUM return collective_comm(inp, mode, group, device) @@ -188,11 +188,11 @@ def all_reduce_sum( def all_reduce_max( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create all_reduce_max operator for collective communication + """Create all_reduce_max operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. """ mode = CollectiveCommMode.ALL_REDUCE_MAX return collective_comm(inp, mode, group, device) @@ -201,11 +201,11 @@ def all_reduce_max( def all_reduce_min( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create all_reduce_min operator for collective communication + """Create all_reduce_min operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. """ mode = CollectiveCommMode.ALL_REDUCE_MIN return collective_comm(inp, mode, group, device) @@ -214,11 +214,11 @@ def all_reduce_min( def gather( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create gather operator for collective communication + """Create gather operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. """ mode = CollectiveCommMode.GATHER return collective_comm(inp, mode, group, device) @@ -227,11 +227,11 @@ def gather( def scatter( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create scatter operator for collective communication + """Create scatter operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. """ mode = CollectiveCommMode.SCATTER return collective_comm(inp, mode, group, device) @@ -240,21 +240,21 @@ def scatter( def all_to_all( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" ) -> Tensor: - """Create all_to_all operator for collective communication + """Create all_to_all operator for collective communication. - :param inp: input tensor - :param group: communication group - :param device: execute placement + :param inp: input tensor. + :param group: communication group. + :param device: execution device. 
""" mode = CollectiveCommMode.ALL_TO_ALL return collective_comm(inp, mode, group, device) def remote_send(inp: Tensor, dest_rank: int) -> Tensor: - """Send a Tensor to a remote process + """Send a Tensor to a remote process. - :param inp: tensor to send - :param dest_rank: destination process rank + :param inp: tensor to send. + :param dest_rank: destination process rank. """ op = RemoteSend() op.key = "{}->{}".format(get_rank(), dest_rank) @@ -266,12 +266,12 @@ def remote_send(inp: Tensor, dest_rank: int) -> Tensor: def remote_recv( src_rank: int, shape: Tuple[int], dtype: type, device: Optional[str] = None ) -> Tensor: - """Receive a Tensor from a remote process + """Receive a Tensor from a remote process. - :param src_rank: source process rank - :param shape: the shape of the tensor to receive - :param dtype: the data type of the tensor to receive - :param device: the device to place the received tensor + :param src_rank: source process rank. + :param shape: the shape of the tensor to receive. + :param dtype: the data type of the tensor to receive. + :param device: the device to place the received tensor. """ key = "{}->{}".format(src_rank, get_rank()) diff --git a/imperative/python/megengine/distributed/group.py b/imperative/python/megengine/distributed/group.py index 2e60a4d5..b6939333 100644 --- a/imperative/python/megengine/distributed/group.py +++ b/imperative/python/megengine/distributed/group.py @@ -83,12 +83,12 @@ def init_process_group( ) -> None: """Initialize the distributed process group and specify the device used in the current process - :param master_ip: IP address of the master node - :param port: Port available for all processes to communicate - :param world_size: Total number of processes participating in the job - :param rank: Rank of the current process - :param device: The GPU device id to bind this process to - :param backend: Communicator backend, currently support 'nccl' and 'ucx' + :param master_ip: ip address of the master node. + :param port: port available for all processes to communicate. + :param world_size: total number of processes participating in the job. + :param rank: rank of the current process. + :param device: the GPU device id to bind this process to. + :param backend: communicator backend, currently support 'nccl' and 'ucx'. 
""" if not isinstance(master_ip, str): raise TypeError("Expect type str but got {}".format(type(master_ip))) @@ -127,50 +127,50 @@ def init_process_group( def is_distributed() -> bool: - """Return True if the distributed process group has been initialized""" + """Return True if the distributed process group has been initialized.""" return _sd is not None def get_rank() -> int: - """Get the rank of the current process""" + """Get the rank of the current process.""" return _sd.proc_rank if _sd is not None else 0 def get_world_size() -> int: - """Get the total number of processes participating in the job""" + """Get the total number of processes participating in the job.""" return _sd.world_size if _sd is not None else 1 def get_backend() -> str: - """Get the backend str""" + """Get the backend str.""" assert _sd is not None, "please call init_process_group first" return _sd.backend if _sd is not None else None def get_py_server_addr() -> Tuple[str, int]: - """Get master_ip and port of python XML RPC server""" + """Get master_ip and port of python XML RPC server.""" assert _sd is not None, "please call init_process_group first" return _sd.master_ip, _sd.py_server_port def get_mm_server_addr() -> Tuple[str, int]: - """Get master_ip and port of C++ mm_server""" + """Get master_ip and port of C++ mm_server.""" assert _sd is not None, "please call init_process_group first" return _sd.master_ip, _sd.mm_server_port def get_client() -> Client: - """Get client of python XML RPC server""" + """Get client of python XML RPC server.""" assert _sd is not None, "please call init_process_group first" return _sd.client def new_group(proc_ranks: List[int]) -> Group: - """Build a subgroup containing certain ranks""" + """Build a subgroup containing certain ranks.""" return Group(proc_ranks) def group_barrier(group: Optional[Group] = WORLD) -> None: - """Block until all ranks in the group reach this barrier""" + """Block until all ranks in the group reach this barrier.""" assert isinstance(group, Group) _sd.client.group_barrier(group.key, group.size) diff --git a/imperative/python/megengine/distributed/helper.py b/imperative/python/megengine/distributed/helper.py index 05db40c1..8d84c5c1 100644 --- a/imperative/python/megengine/distributed/helper.py +++ b/imperative/python/megengine/distributed/helper.py @@ -17,11 +17,112 @@ import numpy as np from megengine.autodiff.grad_manager import GradManager, get_backwarding_grad_manager from megengine.device import get_default_device, get_device_count -from ..functional.param_pack import get_offsets, pack_allreduce_split +from ..core.ops.builtin import ParamPackConcat, ParamPackSplit +from ..core.tensor.core import apply from ..functional.utils import copy +from ..tensor import Tensor from ..utils.future import Future from .functional import all_reduce_sum, broadcast -from .group import WORLD, group_barrier, is_distributed +from .group import WORLD, Group, group_barrier, is_distributed + + +def param_pack_split(inp: Tensor, offsets: list, shapes: list): + r""" + Returns split tensor to tensor list as offsets and shapes described, + only used for ``parampack``. + + :param inp: input tensor. + :param offsets: offsets of outputs, length of `2 * n`, + while n is tensor nums you want to split, + format `[begin0, end0, begin1, end1]`. + :param shapes: tensor shapes of outputs. + :return: splitted tensors. + + Examples: + + .. 
testcode:: + + import numpy as np + from megengine import tensor + from megengine.distributed.helper import param_pack_split + + a = tensor(np.ones((10,), np.int32)) + b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)]) + print(b.numpy()) + print(c.numpy()) + + Outputs: + + .. testoutput:: + + [1] + [[1 1 1] + [1 1 1] + [1 1 1]] + + """ + op = ParamPackSplit() + op.offsets = offsets + op.shapes = shapes + return apply(op, inp) + + +def param_pack_concat(inps: list, offsets: Tensor, offsets_val: list): + r""" + Returns concated tensor, only used for ``parampack``. + + :param inps: input tensors. + :param offsets: device value of offsets. + :param offsets_val: offsets of inputs, length of `2 * n`, + format `[begin0, end0, begin1, end1]`. + :return: concated tensor. + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + from megengine.distributed.helper import param_pack_concat + + a = tensor(np.ones((1,), np.int32)) + b = tensor(np.ones((3, 3), np.int32)) + offsets_val = [0, 1, 1, 10] + offsets = tensor(offsets_val, np.int32) + c = param_pack_concat([a, b], offsets, offsets_val) + print(c.numpy()) + + Outputs: + + .. testoutput:: + + [1 1 1 1 1 1 1 1 1 1] + + """ + op = ParamPackConcat() + op.offsets = offsets_val + return apply(op, *inps, offsets)[0] + + +def get_offsets(shapes): + offsets = [] + offset = 0 + for shape in shapes: + offsets.append(offset) + offset += int(np.prod(shape)) + offsets.append(offset) + return offsets + + +def pack_allreduce_split(pack_list, shapes, group, reduce_method): + offsets_val = get_offsets(shapes) + offsets = Tensor(offsets_val) + packed_grads = param_pack_concat(pack_list, offsets, offsets_val) + packed_grads = all_reduce_sum(packed_grads, group, group.comp_node) + if reduce_method == "mean": + packed_grads /= group.size + grads = param_pack_split(packed_grads, offsets_val, shapes) + return grads class TensorFuture(Future): @@ -54,28 +155,43 @@ def synchronized(func: Callable): return wrapper -def get_device_count_by_fork(device_type: str): - q = mp.Queue() +def _get_device_count_worker(queue, device_type): + num = get_device_count(device_type) + queue.put(num) - def worker(queue): - num = get_device_count(device_type) - queue.put(num) - p = mp.Process(target=worker, args=(q,)) +def get_device_count_by_fork(device_type: str): + """Get device count in fork thread. + See https://stackoverflow.com/questions/22950047/cuda-initialization-error-after-fork + for more information. + """ + q = mp.Queue() + p = mp.Process(target=_get_device_count_worker, args=(q, device_type)) p.start() p.join() return q.get() -def bcast_list_(params, group): - for p in params: - p._reset(broadcast(p, group)) +def bcast_list_(inps: list, group: Group = WORLD): + """Broadcast tensors between given group. + + :param inps: input tensors. + :param group: communication group. + """ + for inp in inps: + inp._reset(broadcast(inp, group)) class AllreduceCallback: - def __init__(self, reduce_method, group=WORLD): + """Allreduce Callback with tensor fusion optimization. + + :param reduce_method: the method to reduce gradiants. + :param group: communication group. 
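For reference, the ``get_offsets`` helper introduced above is small enough to check by hand; the function below is reproduced verbatim from the hunk, and only the driver line is illustrative::

    import numpy as np

    def get_offsets(shapes):
        offsets = []
        offset = 0
        for shape in shapes:
            offsets.append(offset)
            offset += int(np.prod(shape))
            offsets.append(offset)
        return offsets

    # matches the [0, 1, 1, 10] offsets used in the param_pack examples above
    print(get_offsets([(1,), (3, 3)]))  # [0, 1, 1, 10]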
+ """ + + def __init__(self, reduce_method: str, group: Group = WORLD): reduce_method = reduce_method.lower() - assert reduce_method in ["sum", "mean"] + assert reduce_method in ["sum", "mean"], "reduce_method should be sum or mean" self._reduce_method = reduce_method self._group = group self._marked_gm = WeakSet() @@ -88,6 +204,7 @@ class AllreduceCallback: self._futures_dict = dict() self._packing_list = defaultdict(list) self._packing_size = defaultdict(int) + self._grad_origin_device = dict() def _pack(self, dtype): grad_list = [self._gradients_dict[p] for p in self._packing_list[dtype]] @@ -109,6 +226,7 @@ class AllreduceCallback: self._params.append(param) self._futures_dict[param] = TensorFuture(ack=False) self._gradients_dict[param] = grad + self._grad_origin_device[param] = str(grad.device) dtype_str = str(np.dtype(param.dtype)) dtype_size = np.dtype(param.dtype).itemsize @@ -123,6 +241,7 @@ class AllreduceCallback: self._pack(dtype) for param in self._params: grad = self._gradients_dict[param] + grad = copy(grad, self._grad_origin_device[param]) self._futures_dict[param].set(grad) self._reset() diff --git a/imperative/python/megengine/distributed/launcher.py b/imperative/python/megengine/distributed/launcher.py index c5f8bcf8..a6c7c05a 100644 --- a/imperative/python/megengine/distributed/launcher.py +++ b/imperative/python/megengine/distributed/launcher.py @@ -15,7 +15,7 @@ from .util import get_free_ports def _run_wrapped(func, master_ip, port, world_size, rank, dev, args, kwargs): - """init distributed process group and run wrapped function""" + """Init distributed process group and run wrapped function.""" init_process_group( master_ip=master_ip, port=port, world_size=world_size, rank=rank, device=dev ) @@ -23,7 +23,7 @@ def _run_wrapped(func, master_ip, port, world_size, rank, dev, args, kwargs): def launcher(func): - """decorator for launching multiple processes in single-machine multi-gpu training""" + """Decorator for launching multiple processes in single-machine multi-gpu training.""" n_gpus = get_device_count_by_fork("gpu") diff --git a/imperative/python/megengine/distributed/server.py b/imperative/python/megengine/distributed/server.py index d8f199a6..6c51ae7f 100644 --- a/imperative/python/megengine/distributed/server.py +++ b/imperative/python/megengine/distributed/server.py @@ -21,6 +21,12 @@ from .util import get_free_ports class Methods: + """Distributed Server Method. + Used for exchange information between distributed nodes. + + :param mm_server_port: multiple machine rpc server port. + """ + def __init__(self, mm_server_port): self.lock = threading.Lock() self.mm_server_port = mm_server_port @@ -31,51 +37,65 @@ class Methods: self.dict_barrier_event = defaultdict(threading.Event) def connect(self): + """Method for checking connection success.""" return True def get_mm_server_port(self): + """Get multiple machine rpc server port.""" return self.mm_server_port - def set_is_grad(self, rank_peer, is_grad): + def set_is_grad(self, key, is_grad): + """Mark send/recv need gradiants by key. + + :param key: key to match send/recv op. + :param is_grad: whether this op need grad. + """ with self.lock: - future = self.dict_is_grad[rank_peer] + future = self.dict_is_grad[key] future.set(is_grad) return True - def check_is_grad(self, rank_peer): + def check_is_grad(self, key): + """Check whether send/recv need gradiants. + + :param key: key to match send/recv op. 
+ """ with self.lock: - future = self.dict_is_grad[rank_peer] + future = self.dict_is_grad[key] ret = future.get() with self.lock: - del self.dict_is_grad[rank_peer] + del self.dict_is_grad[key] return ret - def set_remote_tracer(self, rank_peer, tracer_set): + def set_remote_tracer(self, key, tracer_set): + """Set tracer dict for tracing send/recv op. + + :param key: key to match send/recv op. + :param tracer_set: valid tracer set. + """ with self.lock: - future = self.dict_remote_tracer[rank_peer] + future = self.dict_remote_tracer[key] future.set(tracer_set) return True - def check_remote_tracer(self, rank_peer): + def check_remote_tracer(self, key): + """Get tracer dict for send/recv op. + + :param key: key to match send/recv op. + """ with self.lock: - future = self.dict_remote_tracer[rank_peer] + future = self.dict_remote_tracer[key] ret = future.get() with self.lock: - del self.dict_remote_tracer[rank_peer] + del self.dict_remote_tracer[key] return ret - def set_pack_list(self, key, pack_list): - with self.lock: - future = self.dict_pack_list[key] - future.set(pack_list) - return True - - def get_pack_list(self, key): - with self.lock: - future = self.dict_pack_list[key] - return future.get() - def group_barrier(self, key, size): + """A barrier wait for all group member. + + :param key: group key to match each other. + :param size: group size. + """ with self.lock: self.dict_barrier_counter[key] += 1 counter = self.dict_barrier_counter[key] @@ -94,12 +114,23 @@ class ThreadXMLRPCServer(ThreadingMixIn, SimpleXMLRPCServer): def start_server(py_server_port, mm_server_port): + """Start python distributed server and multiple machine server. + + :param py_server_port: python server port. + :param mm_server_port: multiple machine server port. + """ server = ThreadXMLRPCServer(("0.0.0.0", py_server_port), logRequests=False) server.register_instance(Methods(mm_server_port)) server.serve_forever() class Server: + """Distributed Server for distributed training. + Should be running at master node. + + :param port: python server port. + """ + def __init__(self, port): self.py_server_port = get_free_ports(1)[0] if port == 0 else port self.mm_server_port = create_mm_server("0.0.0.0", 0) @@ -112,12 +143,19 @@ class Server: class Client: + """Distributed Client for distributed training. + + :param master_ip: ip address of master node. + :param port: port of server at master node. + """ + def __init__(self, master_ip, port): self.master_ip = master_ip self.port = port self.connect() def connect(self): + """Check connection success.""" while True: try: self.proxy = ServerProxy( @@ -129,25 +167,43 @@ class Client: time.sleep(1) def get_mm_server_port(self): + """Get multiple machine server port.""" return self.proxy.get_mm_server_port() - def set_is_grad(self, rank_peer, is_grad): - self.proxy.set_is_grad(rank_peer, is_grad) - - def check_is_grad(self, rank_peer): - return self.proxy.check_is_grad(rank_peer) - - def set_remote_tracer(self, rank_peer, tracer_set): - self.proxy.set_remote_tracer(rank_peer, tracer_set) - - def check_remote_tracer(self, rank_peer): - return self.proxy.check_remote_tracer(rank_peer) - - def set_pack_list(self, key, pack_list): - self.proxy.set_pack_list(key, pack_list) - - def get_pack_list(self, key): - return self.proxy.get_pack_list(key) + def set_is_grad(self, key, is_grad): + """Mark send/recv need gradiants by key. + + :param key: key to match send/recv op. + :param is_grad: whether this op need grad. 
+ """ + self.proxy.set_is_grad(key, is_grad) + + def check_is_grad(self, key): + """Check whether send/recv need gradiants. + + :param key: key to match send/recv op. + """ + return self.proxy.check_is_grad(key) + + def set_remote_tracer(self, key, tracer_set): + """Set tracer dict for tracing send/recv op. + + :param key: key to match send/recv op. + :param tracer_set: valid tracer set. + """ + self.proxy.set_remote_tracer(key, tracer_set) + + def check_remote_tracer(self, key): + """Get tracer dict for send/recv op. + + :param key: key to match send/recv op. + """ + return self.proxy.check_remote_tracer(key) def group_barrier(self, key, size): + """A barrier wait for all group member. + + :param key: group key to match each other. + :param size: group size. + """ self.proxy.group_barrier(key, size) diff --git a/imperative/python/megengine/functional/__init__.py b/imperative/python/megengine/functional/__init__.py index 66793668..37455891 100644 --- a/imperative/python/megengine/functional/__init__.py +++ b/imperative/python/megengine/functional/__init__.py @@ -8,13 +8,10 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # pylint: disable=redefined-builtin from .elemwise import * -from .graph import add_update -from .loss import * from .math import * from .nn import * -from .quantized import conv_bias_activation from .tensor import * -from .utils import accuracy, copy +from .utils import * from . import distributed # isort:skip diff --git a/imperative/python/megengine/functional/debug_param.py b/imperative/python/megengine/functional/debug_param.py index b27f4b4b..451c095d 100644 --- a/imperative/python/megengine/functional/debug_param.py +++ b/imperative/python/megengine/functional/debug_param.py @@ -26,14 +26,14 @@ def set_conv_execution_strategy(option: str): Available values: * 'HEURISTIC' uses heuristic to choose the fastest algorithm. - * 'PROFILE' runs possible algorithms on real device to find the best. - * 'PROFILE_HEURISTIC' uses profile result and heuristic to choose the fastest algorithm. - * 'PROFILE_REPRODUCIBLE' uses the fastest of profile result that is also reproducible. + * 'PROFILE' runs possible algorithms on real device to find the best one. + * 'PROFILE_HEURISTIC' uses profiling result and heuristic to choose the fastest algorithm. + * 'PROFILE_REPRODUCIBLE' uses the fastest of profiling result that is also reproducible. * 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible. The default strategy is 'HEURISTIC'. - It can also be set through the environmental variable 'MEGENGINE_CONV_EXECUTION_STRATEGY'. + It can also be set through the environment variable 'MEGENGINE_CONV_EXECUTION_STRATEGY'. 
""" valid_option = ( "HEURISTIC", diff --git a/imperative/python/megengine/functional/elemwise.py b/imperative/python/megengine/functional/elemwise.py index 3781ae64..3b71291c 100644 --- a/imperative/python/megengine/functional/elemwise.py +++ b/imperative/python/megengine/functional/elemwise.py @@ -26,23 +26,22 @@ __all__ = [ "acosh", "atanh", "ceil", - "clamp", + "clip", "cos", "cosh", "div", - "eq", + "equal", "exp", "expm1", - "fast_tanh", "floor", "floor_div", - "gt", - "ge", + "greater", + "greater_equal", "hswish", "hsigmoid", "left_shift", - "lt", - "le", + "less", + "less_equal", "log", "log1p", "logical_and", @@ -54,7 +53,7 @@ __all__ = [ "mod", "mul", "neg", - "ne", + "not_equal", "pow", "relu", "relu6", @@ -88,13 +87,6 @@ def _elwise(*args, mode): return result -def _logical(*args, mode): - op = builtin.CondExecPredLogical(mode=mode) - args = utils.convert_inputs(*args) - (result,) = apply(op, *args) - return result - - def _elemwise_multi_type(*args, mode, **kwargs): op = builtin.ElemwiseMultiType(mode=mode, **kwargs) args = utils.convert_inputs(*args) @@ -106,9 +98,10 @@ def _elemwise_multi_type(*args, mode, **kwargs): def add(x, y): - """Element-wise addition. + """Element-wise `addition`. At least one operand should be tensor. - Same for sub/mul/div/floor_div/pow/mod/atan2/eq/ne/lt/le/gt/ge/maximum/minmium. + + Same for sub/mul/div/floor_div/pow/mod/atan2/equal/not_equal/less/less_equal/greater/greater_equal/maximum/minmium. :param x: input tensor. :return: computed tensor. @@ -138,68 +131,68 @@ def add(x, y): def sub(x, y): - """Element-wise subtraction.""" + """Element-wise `subtraction`.""" return _elwise(x, y, mode="sub") def mul(x, y): - """Element-wise multiplication.""" + """Element-wise `multiplication`.""" return _elwise(x, y, mode="mul") def div(x, y): - """Element-wise (x / y).""" + """Element-wise `(x / y)`.""" return _elwise(x, y, mode="true_div") def floor_div(x, y): - """Element-wise floor(x / y).""" + """Element-wise `floor(x / y)`.""" return _elwise(x, y, mode="floor_divide") def neg(x): - """Element-wise negation.""" + """Element-wise `negation`.""" return _elwise(x, mode="negate") def pow(x, y): - """Element-wise power.""" + """Element-wise `power`.""" return _elwise(x, y, mode="pow") def mod(x, y): - """Element-wise remainder of division.""" + """Element-wise `remainder of division`.""" return _elwise(x, y, mode="mod") def abs(x): - """Element-wise absolute value.""" + """Element-wise `absolute value`.""" return _elwise(x, mode="abs") def exp(x): - """Element-wise exponential.""" + """Element-wise `exponential`.""" return _elwise(x, mode="exp") def expm1(x): - """Element-wise exp(x)-1.""" + """Element-wise `exp(x)-1`.""" return _elwise(x, mode="expm1") def log(x): - """Element-wise logarithm (base `e`).""" + """Element-wise `logarithm (base e)`.""" return _elwise(x, mode="log") def log1p(x): - """Element-wise log(x+1) (base `e`).""" + """Element-wise `log(x+1) (base e)`.""" return _elwise(x, mode="log1p") def sqrt(x: Tensor) -> Tensor: - """Element-wise sqrt. - For negative input value, return ``NaN``. + """Element-wise `sqrt`. + Returns ``NaN`` for negative input value. :param x: input tensor. :return: computed tensor. @@ -229,10 +222,10 @@ def sqrt(x: Tensor) -> Tensor: def square(x: Tensor) -> Tensor: """ - Return a new tensor with the square of the elements of input tensor. + Returns a new tensor with the square of the elements of input tensor. - :param inp: The input tensor - :return: The computed tensor + :param inp: input tensor. 
+ :return: computed tensor. Examples: @@ -258,27 +251,27 @@ def square(x: Tensor) -> Tensor: def round(x): - """Element-wise rounding to int.""" + """Element-wise `rounding to int`.""" return _elwise(x, mode="round") def ceil(x): - """Element-wise ceiling.""" + """Element-wise `ceiling`.""" return _elwise(x, mode="ceil") def floor(x): - """Element-wise floor.""" + """Element-wise `floor`.""" return _elwise(x, mode="floor") def maximum(x, y): - """Element-wise maximum of array elements.""" + """Element-wise `maximum of array elements`.""" return _elwise(x, y, mode="max") def minimum(x, y): - """Element-wise minimum of array elements.""" + """Element-wise `minimum of array elements`.""" return _elwise(x, y, mode="min") @@ -286,7 +279,7 @@ def minimum(x, y): def cos(x): - """Element-wise cosine. + """Element-wise `cosine`. :param x: input tensor. :return: computed tensor. @@ -315,80 +308,71 @@ def cos(x): def sin(x): - """Element-wise sine.""" + """Element-wise `sine`.""" return _elwise(x, mode="sin") def tan(x): - """Element-wise tangent.""" + """Element-wise `tangent`.""" return sin(x) / cos(x) def acos(x): - """Element-wise inverse cosine.""" + """Element-wise `inverse cosine`.""" return _elwise(x, mode="acos") def asin(x): - """Element-wise inverse sine.""" + """Element-wise `inverse sine`.""" return _elwise(x, mode="asin") def atan(x): - """Element-wise inverse tangent.""" + """Element-wise `inverse tangent`.""" return _elwise(x, 1, mode="atan2") def atan2(y, x): - """Element-wise 2-argument arctangent.""" + """Element-wise `2-argument arctangent`.""" return _elwise(y, x, mode="atan2") def cosh(x): - r"""Element-wise hyperbolic cosine.""" + r"""Element-wise `hyperbolic cosine`.""" return 0.5 * (exp(x) + exp(-x)) def sinh(x): - r"""Element-wise hyperbolic sine.""" + r"""Element-wise `hyperbolic sine`.""" u = expm1(x) return 0.5 * u / (u + 1) * (u + 2) def tanh(x): - r"""Element-wise hyperbolic tangent.""" + r"""Element-wise `hyperbolic tangent`.""" return _elwise(x, mode="tanh") def asinh(x): - r"""Element-wise inverse hyperbolic sine.""" + r"""Element-wise `inverse hyperbolic sine`.""" return log(x + (x ** 2 + 1) ** 0.5) def acosh(x): - r"""Element-wise inverse hyperbolic cosine.""" + r"""Element-wise `inverse hyperbolic cosine`.""" return log(x + (x ** 2 - 1) ** 0.5) def atanh(x): - r"""Element-wise inverse hyperbolic tangent.""" + r"""Element-wise `inverse hyperbolic tangent`.""" return log1p(2 * x / (1 - x)) / 2 -def fast_tanh(x): - r"""Element-wise fast tanh; this is an approximation: - - .. math:: - \text{fast_tanh}(x) = x * (27. + x * x) / (27. + 9. * x * x) - """ - return _elwise(x, mode="fast_tanh") - - # bit-twiddling functions def left_shift(x, y): - """Element-wise bitwise binary: x << y. + """Element-wise `bitwise binary: x << y`. :param x: input tensor, should be int. :param y: how many bits to be left-shifted. 
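A small runnable illustration of ``left_shift`` (editor's sketch; the imports follow the testcode convention used elsewhere in this patch)::

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    x = tensor(np.array([1, 2, 3], dtype=np.int32))
    print(F.left_shift(x, 2).numpy())  # [ 4  8 12], i.e. x << 2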
@@ -418,7 +402,7 @@ def left_shift(x, y): def right_shift(x, y): - """Element-wise bitwise binary: x >> y.""" + """Element-wise `bitwise binary: x >> y`.""" return _elwise(x, y, mode="shr") @@ -426,30 +410,30 @@ def right_shift(x, y): def logical_and(x, y): - """Element-wise logical and: x && y.""" + """Element-wise `logical and: x && y`.""" return _elwise(x, y, mode="AND") def logical_not(x): - """Element-wise logical not: ~x.""" + """Element-wise `logical not: ~x`.""" return _elwise(x, mode="NOT") def logical_or(x, y): - """Element-wise logical or: x || y.""" + """Element-wise `logical or: x || y`.""" return _elwise(x, y, mode="OR") def logical_xor(x, y): - """Element-wise logical xor: x ^ y.""" + """Element-wise `logical xor: x ^ y`.""" return _elwise(x, y, mode="XOR") # comparison functions -def eq(x, y): - """Element-wise (x == y). +def equal(x, y): + """Element-wise `(x == y)`. :param x: input tensor 1. :param y: input tensor 2. @@ -465,7 +449,7 @@ def eq(x, y): x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - out = F.eq(x, y) + out = F.equal(x, y) print(out.numpy()) Outputs: @@ -479,28 +463,28 @@ def eq(x, y): return _elwise(x, y, mode="eq") -def ne(x, y): - """Element-wise (x != y).""" +def not_equal(x, y): + """Element-wise `(x != y)`.""" return x != y -def lt(x, y): - """Element-wise (x < y).""" +def less(x, y): + """Element-wise `(x < y)`.""" return _elwise(x, y, mode="lt") -def le(x, y): - """Element-wise (x <= y).""" +def less_equal(x, y): + """Element-wise `(x <= y)`.""" return _elwise(x, y, mode="leq") -def gt(x, y): - """Element-wise (x > y).""" +def greater(x, y): + """Element-wise `(x > y)`.""" return _elwise(y, x, mode="lt") -def ge(x, y): - """Element-wise (x >= y).""" +def greater_equal(x, y): + """Element-wise `(x >= y)`.""" return _elwise(y, x, mode="leq") @@ -508,7 +492,7 @@ def ge(x, y): def hswish(x): - """Element-wise x * relu6(x + 3) / 6. + """Element-wise `x * relu6(x + 3) / 6`. :param x: input tensor. :return: computed tensor. 
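The comparison renames above change only the spelling; note that ``greater`` and ``greater_equal`` are still computed by swapping operands into the ``lt``/``leq`` modes. A quick sketch of the new names (hypothetical session, assuming this branch is built):

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    x = tensor(np.array([0.0, 1.0, 2.0], dtype=np.float32))
    y = tensor(np.array([1.0, 1.0, 1.0], dtype=np.float32))
    print(F.equal(x, y).numpy())          # was F.eq
    print(F.less(x, y).numpy())           # was F.lt
    print(F.greater_equal(x, y).numpy())  # was F.ge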
@@ -534,7 +518,7 @@ def hswish(x): def hsigmoid(x): - """Element-wise relu6(x + 3) / 6.""" + """Element-wise `relu6(x + 3) / 6`.""" return relu6(x + 3) / 6 @@ -544,16 +528,16 @@ def relu(x): def relu6(x): - """Element-wise min(max(x, 0), 6).""" + """Element-wise `min(max(x, 0), 6)`.""" return minimum(maximum(x, 0), 6) def sigmoid(x): - """Element-wise 1 / ( 1 + exp( -x ) ).""" + """Element-wise `1 / ( 1 + exp( -x ) )`.""" return _elwise(x, mode="sigmoid") -def clamp(x: Tensor, lower=None, upper=None) -> Tensor: +def clip(x: Tensor, lower=None, upper=None) -> Tensor: r"""Clamps all elements in input tensor into the range `[` :attr:`lower`, :attr:`upper` `]` and returns a resulting tensor: @@ -578,9 +562,9 @@ def clamp(x: Tensor, lower=None, upper=None) -> Tensor: import megengine.functional as F a = tensor(np.arange(5).astype(np.int32)) - print(F.clamp(a, 2, 4).numpy()) - print(F.clamp(a, lower=3).numpy()) - print(F.clamp(a, upper=3).numpy()) + print(F.clip(a, 2, 4).numpy()) + print(F.clip(a, lower=3).numpy()) + print(F.clip(a, upper=3).numpy()) Outputs: @@ -596,7 +580,7 @@ def clamp(x: Tensor, lower=None, upper=None) -> Tensor: ), "At least one of 'lower' or 'upper' must not be None" if lower is not None: if upper is not None: - assert lower <= upper, "clamp lower bound is bigger that upper bound" + assert lower <= upper, "clip lower bound is bigger that upper bound" return minimum(maximum(x, lower), upper) else: return maximum(x, lower) diff --git a/imperative/python/megengine/functional/external.py b/imperative/python/megengine/functional/external.py deleted file mode 100644 index f164ed19..00000000 --- a/imperative/python/megengine/functional/external.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# pylint: disable=too-many-lines -from typing import List - -from ..tensor import Tensor - - -def cambricon_subgraph( - inputs: List[Tensor], data: bytes, symbol: str, tensor_dim_mutable: bool, -) -> List[Tensor]: - """Loads a serialized Cambricon subgraph (i.e. cnrtModel_t) and - execute the operations defined in the subgraph. - - :param inputs: list of input tensors of the subgraph. - :param data: the serialized subgraph. - :param symbol: the name of the function in the subgraph. - The function is corresponding to a cnmlFusionOp - which is added to the cnmlModel_t/cnrtModel_t. - :param tensor_dim_mutable: whether the input tensors' shapes are mutalbe - in cnrtModel_t. - """ - raise NotImplementedError - - -def extern_opr_subgraph( - inputs, output_shapes: List[tuple], dump_name: str, dump_data: bytes, -) -> List[Tensor]: - """Loads a serialized extern opr subgraph and fake execute the operator. - - :param inputs: tensor or list of input tensors. - :param output_shapes: the output shapes. - :param dump_name: the serialized subgraph name. - :param dump_data: the serialized subgraph. - - :return: list of tensors. 
- """ - raise NotImplementedError diff --git a/imperative/python/megengine/functional/graph.py b/imperative/python/megengine/functional/graph.py deleted file mode 100644 index 6af627ee..00000000 --- a/imperative/python/megengine/functional/graph.py +++ /dev/null @@ -1,41 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections -from typing import Iterable, Optional, Union - -from ..tensor import Tensor - - -def add_update( - dest: Tensor, - delta: Tensor, - *, - alpha: Union[Tensor, float, int] = 1.0, - beta: Union[Tensor, float, int] = 1.0, - bias: Union[Tensor, float, int] = 0.0 -): - r"""Modify ``dest`` inplace as follows: - - .. math:: - dest = alpha * dest + beta * delta + bias - - :param dest: input data that will be inplace modified. - :param delta: update value that will be added to ``dest``. - :param alpha: weight ratio of ``dest``. Default: 1.0 - :param beta: weight ratio of ``delta``. Default: 1.0 - :param bias: bias value appended to the result. Default: 0.0 - """ - if beta is not None and beta != 1.0: - delta = delta * beta - if bias is not None and bias != 0.0: - delta = delta + bias - if alpha is not None and alpha != 1.0: - dest *= alpha - dest += delta - return dest diff --git a/imperative/python/megengine/functional/loss.py b/imperative/python/megengine/functional/loss.py index 77d3e1f3..40da78d8 100644 --- a/imperative/python/megengine/functional/loss.py +++ b/imperative/python/megengine/functional/loss.py @@ -10,14 +10,14 @@ import numpy as np from ..core.tensor.utils import make_shape_tuple from ..tensor import Tensor -from .elemwise import abs, eq, exp, log, maximum, pow, relu -from .nn import indexing_one_hot +from .elemwise import abs, equal, exp, log, maximum, pow, relu +from .nn import indexing_one_hot, logsigmoid, logsumexp from .tensor import where __all__ = [ "l1_loss", "square_loss", - "cross_entropy_with_softmax", + "cross_entropy", "binary_cross_entropy", "hinge_loss", ] @@ -55,7 +55,7 @@ def l1_loss(pred: Tensor, label: Tensor) -> Tensor: ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32)) tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32)) - loss = F.l1_loss(ipt, tgt) + loss = F.nn.l1_loss(ipt, tgt) print(loss.numpy()) Outputs: @@ -106,7 +106,7 @@ def square_loss(pred: Tensor, label: Tensor) -> Tensor: ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32)) tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32)) - loss = F.square_loss(ipt, tgt) + loss = F.nn.square_loss(ipt, tgt) print(loss.numpy()) Outputs: @@ -120,10 +120,16 @@ def square_loss(pred: Tensor, label: Tensor) -> Tensor: return (diff ** 2).mean() -def cross_entropy_with_softmax( - pred: Tensor, label: Tensor, axis: int = 1, label_smooth: float = 0 +def cross_entropy( + pred: Tensor, + label: Tensor, + axis: int = 1, + with_logits: bool = True, + label_smooth: float = 0, ) -> Tensor: - r"""Returns loss after applying :func:`~.softmax` + :func:`~.cross_entropy`. + r"""Compute the multi-class cross entropy loss (using logits by default). + + By default, prediction is assumed to be logits, whose softmax gives probabilities. 
It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`. @@ -132,11 +138,12 @@ def cross_entropy_with_softmax( .. math:: y^{LS}_{k}=y_{k}\left(1-\alpha\right)+\alpha/K where :math:`y^{LS}` and :math:`y` are new label distribution and origin label distribution respectively. - k is the index of label distribution. :math:`\alpha` is label_smooth and :math:`K` is the number of classes. + k is the index of label distribution. :math:`\alpha` is ``label_smooth`` and :math:`K` is the number of classes. :param pred: input tensor representing the predicted probability. :param label: input tensor representing the classification label. :param axis: an axis along which softmax will be applied. Default: 1 + :param with_logits: whether to apply softmax first. Default: True :param label_smooth: a label smoothing of parameter that can re-distribute target distribution. Default: 0 :return: loss value. @@ -150,9 +157,9 @@ def cross_entropy_with_softmax( data_shape = (1, 2) label_shape = (1, ) - pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(data_shape)) + pred = tensor(np.array([0, 0], dtype=np.float32).reshape(data_shape)) label = tensor(np.ones(label_shape, dtype=np.int32)) - loss = F.cross_entropy_with_softmax(pred, label) + loss = F.nn.cross_entropy(pred, label) print(loss.numpy()) Outputs: @@ -170,26 +177,41 @@ def cross_entropy_with_softmax( ) num_classes = pred.shape[axis] + no_label_smooth = ( + label_smooth is None or type(label_smooth) in (int, float) and label_smooth == 0 + ) + + if not with_logits: + if no_label_smooth: + return -log(indexing_one_hot(pred, label, axis)).mean() + pred = log(pred) + return ( + label_smooth * pred.mean() + - (1 - label_smooth) * indexing_one_hot(pred, label, axis).mean() + ) # Denominator of the softmax - offset = pred.max(axis=axis, keepdims=True).detach() - pred = pred - offset - down = exp(pred).sum(axis=axis, keepdims=True) + down = logsumexp(pred, axis=axis, keepdims=True) up = indexing_one_hot(pred, label, axis) - if label_smooth != 0: + if not no_label_smooth: factor = label_smooth / num_classes up = up * (1 - label_smooth) + pred.sum(axis=axis, keepdims=True) * factor - return (log(down) - up).mean() + return (down - up).mean() -def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor: - r"""Function that measures the Binary Cross Entropy between the target and the prediction. +def binary_cross_entropy( + pred: Tensor, label: Tensor, with_logits: bool = True +) -> Tensor: + r"""Compute the binary cross entropy loss (using logits by default). + + By default, prediction is assumed to be logits, whose sigmoid gives probabilities. - :param pred: `(N, *)` where `*` means any number of additional dimensions. + :param pred: `(N, *)`, where `*` means any number of additional dimensions. :param label: `(N, *)`, same shape as the input. + :param with_logits: bool, whether to apply sigmoid first. Default: True :return: loss value. 
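With the ``with_logits`` switch, the default logits path and the explicit-probability path should agree numerically; a hedged sanity check (assumes this branch is built):

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    logits = tensor(np.array([[0.3, -1.2]], dtype=np.float32))
    label = tensor(np.ones((1, 2), dtype=np.float32))

    a = F.nn.binary_cross_entropy(logits, label)  # sigmoid applied internally
    b = F.nn.binary_cross_entropy(F.sigmoid(logits), label, with_logits=False)
    np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-4)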
Examples: @@ -200,9 +222,9 @@ def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor: from megengine import tensor import megengine.functional as F - pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(1, 2)) + pred = tensor(np.array([0, 0], dtype=np.float32).reshape(1, 2)) label = tensor(np.ones((1, 2), dtype=np.float32)) - loss = F.binary_cross_entropy(pred, label) + loss = F.nn.binary_cross_entropy(pred, label) print(loss.numpy()) Outputs: @@ -212,11 +234,15 @@ def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor: [0.6931] """ - return -1.0 * (label * log(pred) + (1.0 - label) * log(1 - pred)).mean() + if not with_logits: + return -(label * log(pred) + (1 - label) * log(1 - pred)).mean() + # logsigmoid(pred) and logsigmoid(-pred) has common sub-expression + # hopefully the backend would optimize this + return -(label * logsigmoid(pred) + (1 - label) * logsigmoid(-pred)).mean() def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor: - r"""Caculate the hinge loss which is often used in SVMs. + r"""Caculates the hinge loss which is often used in SVM. The hinge loss can be described as: @@ -236,7 +262,7 @@ def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor: pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32") label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32") - loss = F.hinge_loss(pred, label) + loss = F.nn.hinge_loss(pred, label) print(loss.numpy()) Outputs: diff --git a/imperative/python/megengine/functional/math.py b/imperative/python/megengine/functional/math.py index 408fd848..804bb730 100644 --- a/imperative/python/megengine/functional/math.py +++ b/imperative/python/megengine/functional/math.py @@ -14,11 +14,12 @@ from typing import Optional, Sequence, Tuple, Union from ..core.ops import builtin from ..core.ops._internal import param_defs as P +from ..core.ops.special import Const from ..core.tensor import utils -from ..core.tensor.core import apply +from ..core.tensor.core import TensorBase, TensorWrapperBase, apply from ..tensor import Tensor -from .elemwise import clamp, exp, log, log1p -from .tensor import add_axis, remove_axis, reshape +from .elemwise import clip, exp, log, log1p +from .tensor import reshape, squeeze __all__ = [ "argmax", @@ -45,7 +46,7 @@ def isnan(inp: Tensor) -> Tensor: r"""Returns a new tensor representing if each element is ``NaN`` or not. :param inp: input tensor. - :return: a new tensor representing if each element in inp is NaN or not. + :return: result tensor. Examples: @@ -71,7 +72,7 @@ def isinf(inp: Tensor) -> Tensor: r"""Returns a new tensor representing if each element is ``Inf`` or not. :param inp: input tensor. - :return: a new tensor representing if each element in inp is Inf or not. + :return: result tensor. Examples: @@ -84,7 +85,7 @@ def isinf(inp: Tensor) -> Tensor: print(F.isinf(x).numpy()) Outputs: - + .. testoutput:: [False True False] @@ -108,7 +109,7 @@ def sign(inp: Tensor): x = tensor([1, -1, 0]) print(F.sign(x).numpy()) - + Outputs: .. testoutput:: @@ -128,7 +129,7 @@ def sum( reduce over all of them. :param inp: input tensor. - :param axis: dimension to reduce. If None, all the dimensions will be reduced. + :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False @@ -163,7 +164,7 @@ def prod( reduce over all of them. :param inp: input tensor. - :param axis: dimension to reduce. If None, all the dimensions will be reduced. 
Default: None + :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False :return: output tensor. @@ -199,7 +200,7 @@ def mean( reduce over all of them. :param inp: input tensor. - :param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None + :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False :return: output tensor. @@ -235,7 +236,7 @@ def var( reduce over all of them. :param inp: input tensor. - :param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None + :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False :return: output tensor. @@ -275,7 +276,7 @@ def std( reduce over all of them. :param inp: input tensor. - :param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None + :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False :return: output tensor. @@ -310,7 +311,7 @@ def min( reduce over all of them. :param inp: input tensor. - :param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None + :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False :return: output tensor. @@ -346,7 +347,7 @@ def max( reduce over all of them. :param inp: input tensor. - :param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None + :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False :return: output tensor. @@ -373,18 +374,14 @@ def max( def norm( - inp: Tensor, - p: int = 2, - axis: Optional[Union[int, Sequence[int]]] = None, - keepdims=False, + inp: Tensor, ord: float = None, axis: int = None, keepdims=False, ): """Calculates ``p``-norm of input tensor along - given axis. If axis is a list of dimensions, - reduce over all of them. + given axis. :param inp: input tensor. - :param p: power of value applied to inp. Default: 2 - :param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None + :param ord: power of value applied to inp. Default: 2 + :param axis: dimension to reduce. If None, input must be a vector. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False :return: output tensor. 
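Under the new signature, ``ord`` defaults to 2 and ``axis`` may be omitted only when the input is a vector; otherwise a ``TypeError`` is raised. Both call styles, sketched (hypothetical session):

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    v = tensor(np.arange(-3, 3, dtype=np.float32))
    print(F.norm(v).numpy())                 # L2 norm of a vector; axis omitted

    m = tensor(np.arange(6, dtype=np.float32).reshape(2, 3))
    print(F.norm(m, ord=1, axis=1).numpy())  # per-row L1 norm; axis required for ndim > 1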
@@ -396,7 +393,7 @@ def norm( from megengine import tensor import megengine.functional as F - x = tensor(np.arange(-3, 3, dtype=np.float32).reshape(2,3)) + x = tensor(np.arange(-3, 3, dtype=np.float32)) out = F.norm(x) print(out.numpy()) @@ -407,13 +404,18 @@ def norm( [4.3589] """ - if p == 0: + if axis is None: + if inp.ndim != 1: + raise TypeError("axis is required unless input is a vector") + if ord is None: + ord = 2 + if ord == 0: return sum(inp != 0, axis=axis, keepdims=keepdims) - if p == math.inf: + if ord == math.inf: return max(abs(inp)) - if p == -math.inf: + if ord == -math.inf: return min(abs(inp)) - return sum(abs(inp) ** p, axis=axis, keepdims=keepdims) ** (1.0 / p) + return sum(abs(inp) ** ord, axis=axis, keepdims=keepdims) ** (1.0 / ord) def argmin( @@ -426,7 +428,7 @@ def argmin( reduce over all of them. :param inp: input tensor. - :param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None + :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False :return: output tensor. @@ -458,7 +460,7 @@ def argmin( (inp,) = apply(op, inp) if not keepdims: - inp = remove_axis(inp, ai) + inp = squeeze(inp, ai) return inp @@ -470,7 +472,7 @@ def argmin( op = builtin.Argmin(axis=axis) (result,) = apply(op, inp) if not keepdims: - result = remove_axis(result, axis) + result = squeeze(result, axis) return result @@ -484,7 +486,7 @@ def argmax( reduce over all of them. :param inp: input tensor. - :param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None + :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None :param keepdims: whether the output tensor has axis retained or not. Default: False :return: output tensor. @@ -516,7 +518,7 @@ def argmax( (inp,) = apply(op, inp) if not keepdims: - inp = remove_axis(inp, ai) + inp = squeeze(inp, ai) return inp @@ -528,45 +530,40 @@ def argmax( op = builtin.Argmax(axis=axis) (result,) = apply(op, inp) if not keepdims: - result = remove_axis(result, axis) + result = squeeze(result, axis) return result def normalize( - inp: Tensor, - p: int = 2, - axis: Optional[Union[int, Sequence[int]]] = None, - eps: float = 1e-12, + inp: Tensor, ord: float = None, axis: int = None, eps: float = 1e-12, ) -> Tensor: r"""Performs :math:`L_p` normalization of input tensor along - given axis. If axis is a list of dimensions, - reduce over all of them. + given axis. - For a tensor inp of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each + For a tensor of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each :math:`n_{dim}` -element vector :math:`v` along dimension :attr:`axis` is transformed as: .. math:: v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}. :param inp: input tensor. - :param p: power of value applied to inp. Default: 2 - :param axis: dimension to reduce. If None, all the dimensions will be reduced - to calculate the norm. Default: None + :param ord: power of value applied to input tensor. Default: 2 + :param axis: dimension to reduce.If None, input must be a vector. Default: None :param eps: a small value to avoid division by zero. Default: 1e-12 :return: normalized output tensor. 
""" if axis is None: - return inp / clamp(norm(inp, p, axis), lower=eps) + return inp / clip(norm(inp, ord, axis), lower=eps) else: - return inp / clamp(norm(inp, p, axis, keepdims=True), lower=eps) + return inp / clip(norm(inp, ord, axis, keepdims=True), lower=eps) def argsort(inp: Tensor, descending: bool = False) -> Tensor: - r"""Sorts the target 2d matrix by row, return both the sorted tensor and indices. + r"""Returns the indices that would sort the input tensor. - :param inp: input tensor, if 2d, each row will be sorted. - :param descending: Sort in descending order, where the largest comes first. Default: False - :return: Tuple of two tensors `(sorted_tensor, indices_of_int32)`. + :param inp: input tensor. If it's 2d, the result would be array of indices show how to sort each row in the input tensor. + :param descending: sort in descending order, where the largest comes first. Default: False + :return: indices of int32 indicates how to sort the input. Examples: @@ -603,6 +600,31 @@ def argsort(inp: Tensor, descending: bool = False) -> Tensor: def sort(inp: Tensor, descending: bool = False) -> Tuple[Tensor, Tensor]: + r"""Returns sorted tensor and the indices would sort the input tensor. + + :param inp: input tensor. If it's 2d, the result would be sorted by row. + :param descending: sort in descending order, where the largest comes first. Default: False + :return: tuple of two tensors `(sorted_tensor, indices_of_int32)`. + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + + x = tensor(np.array([1,2], dtype=np.float32)) + out, indices = F.sort(x) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [1. 2.] + + """ assert len(inp.shape) <= 2, "Input should be 1d or 2d" if descending: order = P.Argsort.Order.DESCENDING @@ -625,13 +647,13 @@ def topk( kth_only: bool = False, no_sort: bool = False, ) -> Tuple[Tensor, Tensor]: - r"""Selects the ``Top-K(by default)`` smallest elements of 2d matrix by row. + r"""Selects the ``Top-K``(by default) smallest elements of 2d matrix by row. - :param inp: input tensor, if 2d, each row will be sorted. + :param inp: input tensor. If input tensor is 2d, each row will be sorted. :param k: number of elements needed. - :param descending: if true, return the largest elements instead. Default: False - :param kth_only: if true, only the k-th element will be returned. Default: False - :param no_sort: if true, the returned elements can be unordered. Default: False + :param descending: if True, return the largest elements instead. Default: False + :param kth_only: if True, only the k-th element will be returned. Default: False + :param no_sort: if True, the returned elements can be unordered. Default: False :return: tuple of two tensors `(topk_tensor, indices_of_int32)`. 
Examples: @@ -665,15 +687,18 @@ def topk( mode = Mode.VALUE_IDX_SORTED op = builtin.TopK(mode=mode) + if not isinstance(k, (TensorBase, TensorWrapperBase)): + (k,) = Const(k, dtype="int32", device=inp.device)(inp) + if len(inp.shape) == 1: inp = inp.reshape(1, -1) - res = apply(op, inp, Tensor(k, dtype="int32")) + res = apply(op, inp, k) if kth_only: tns = res[0] else: tns, ind = res[0][0], res[1][0] else: - res = apply(op, inp, Tensor(k, dtype="int32")) + res = apply(op, inp, k) if kth_only: tns = res else: diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py index 2282956b..7d120b8e 100644 --- a/imperative/python/megengine/functional/nn.py +++ b/imperative/python/megengine/functional/nn.py @@ -13,46 +13,51 @@ from ..core._imperative_rt import CompNode from ..core.ops import builtin from ..core.ops._internal import param_defs as P from ..core.ops.special import Const -from ..core.tensor import utils +from ..core.tensor import megbrain_graph, utils from ..core.tensor.core import TensorBase, TensorWrapperBase, apply +from ..core.tensor.utils import astensor1d from ..distributed import WORLD, is_distributed +from ..jit.tracing import is_tracing from ..random import uniform from ..tensor import Tensor from .debug_param import get_conv_execution_strategy from .distributed import all_reduce_sum from .elemwise import exp, floor, log, log1p, maximum, minimum, relu from .math import argsort, max, sum -from .tensor import add_axis, broadcast, concat, full, ones, remove_axis, reshape, zeros +from .tensor import ( + broadcast_to, + concat, + expand_dims, + full, + ones, + reshape, + squeeze, + zeros, +) from .types import _pair, _pair_nonzero __all__ = [ + "adaptive_avg_pool2d", + "adaptive_max_pool2d", "avg_pool2d", - "batched_nms", - "batch_norm2d", + "batch_norm", "conv2d", "conv_transpose2d", "dot", "dropout", - "embedding", "indexing_one_hot", - "interpolate", "leaky_relu", - "linear", "local_conv2d", "logsigmoid", "logsumexp", - "log_softmax", + "logsoftmax", "matmul", "max_pool2d", - "nms", "one_hot", "prelu", - "roi_align", - "roi_pooling", "softmax", "softplus", "svd", - "sync_batch_norm", "warp_perspective", ] @@ -106,19 +111,18 @@ def conv2d( :param padding: size of the paddings added to the input on both sides of its spatial dimensions. Only zero-padding is supported. Default: 0 :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a ``grouped convolution``. When groups is not 1, - in_channels and out_channels must be divisible by groups, + :param groups: number of groups into which the input and output channels are divided, so as to perform a ``grouped convolution``. When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, and the shape of weight should be `(groups, out_channel // groups, in_channels // groups, height, width)`. - :type conv_mode: string or :class:`P.Convolution.Mode`. + :type conv_mode: string or :class:`P.Convolution.Mode` :param conv_mode: supports "CROSS_CORRELATION" or "CONVOLUTION". Default: "CROSS_CORRELATION" :type compute_mode: string or - :class:`P.Convolution.ComputeMode`. + :class:`P.Convolution.ComputeMode` :param compute_mode: when set to "DEFAULT", no special requirements will be placed on the precision of intermediate results. 
When set to "FLOAT32", - Float32 would be used for accumulator and intermediate result, but only + "Float32" would be used for accumulator and intermediate result, but only effective when input and output are of Float16 dtype. :return: output tensor. """ @@ -167,24 +171,23 @@ def conv_transpose2d( :param inp: feature map of the convolution operation. :param weight: convolution kernel. - :param bias: bias added to the result of convolution (if given) + :param bias: bias added to the result of convolution (if given). :param stride: stride of the 2D convolution operation. Default: 1 :param padding: size of the paddings added to the input on both sides of its spatial dimensions. Only zero-padding is supported. Default: 0 :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a ``grouped convolution``. When groups is not 1, - in_channels and out_channels must be divisible by groups, + :param groups: number of groups into which the input and output channels are divided, so as to perform a ``grouped convolution``. When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by groups, and the shape of weight should be `(groups, out_channel // groups, in_channels // groups, height, width)`. Default: 1 - :type conv_mode: string or :class:`P.Convolution.Mode`. + :type conv_mode: string or :class:`P.Convolution.Mode` :param conv_mode: supports "CROSS_CORRELATION" or "CONVOLUTION". Default: "CROSS_CORRELATION" :type compute_mode: string or - :class:`P.Convolution.ComputeMode`. + :class:`P.Convolution.ComputeMode` :param compute_mode: when set to "DEFAULT", no special requirements will be placed on the precision of intermediate results. When set to "FLOAT32", - Float32 would be used for accumulator and intermediate result, but only + "Float32" would be used for accumulator and intermediate result, but only effective when input and output are of Float16 dtype. :return: output tensor. """ @@ -222,10 +225,8 @@ def local_conv2d( padding: Union[int, Tuple[int, int]] = 0, dilation: Union[int, Tuple[int, int]] = 1, conv_mode="CROSS_CORRELATION", -) -> Tensor: - """Applies spatial 2D convolution over an image with untied kernels. - - Refer to :class:`~.LocalConv2d` for more information. +): + """Applies spatial 2D convolution over an groupped channeled image with untied kernels. """ assert conv_mode == "CROSS_CORRELATION" or conv_mode.name == "CROSS_CORRELATION" @@ -233,6 +234,8 @@ def local_conv2d( pad_h, pad_w = expand_hw(padding) dilate_h, dilate_w = expand_hw(dilation) + Sparse = P.Convolution.Sparse + op = builtin.GroupLocal( stride_h=stride_h, stride_w=stride_w, @@ -240,7 +243,9 @@ def local_conv2d( pad_w=pad_w, dilate_h=dilate_h, dilate_w=dilate_w, - # strategy=get_conv_execution_strategy(), + mode=conv_mode, + compute_mode="DEFAULT", + sparse=Sparse.DENSE, ) inp, weight = utils.convert_inputs(inp, weight) (output,) = apply(op, inp, weight) @@ -263,7 +268,7 @@ def max_pool2d( :param kernel_size: size of the window. :param stride: stride of the window. If not provided, its value is set to kernel_size. Default: None - :param padding: implicit zero padding to be added on both sides. Default: 0 + :param padding: implicit zero padding added on both sides. Default: 0 :return: output tensor. 
""" if stride is None: @@ -292,15 +297,15 @@ def avg_pool2d( padding: Union[int, Tuple[int, int]] = 0, mode: str = "AVERAGE_COUNT_EXCLUDE_PADDING", ) -> Tensor: - """Applies a 2D average pooling over an input tensor. + """Applies 2D average pooling over an input tensor. Refer to :class:`~.AvgPool2d` for more information. :param inp: input tensor. :param kernel_size: size of the window. - :param stride: stride of the window. If not provided, its value is set to kernel_size. + :param stride: stride of the window. If not provided, its value is set to ``kernel_size``. Default: None - :param padding: implicit zero padding to be added on both sides. Default: 0 + :param padding: implicit zero padding added on both sides. Default: 0 :param mode: whether to count padding values. Default: "AVERAGE_COUNT_EXCLUDE_PADDING" :return: output tensor. """ @@ -323,6 +328,48 @@ def avg_pool2d( return output +def adaptive_max_pool2d( + inp: Tensor, oshp: Union[Tuple[int, int], int, Tensor], +) -> Tensor: + """Applies a 2D max adaptive pooling over an input. + + Refer to :class:`~.MaxAdaptivePool2d` for more information. + + :param inp: The input tensor. + :param oshp: (OH, OW) size of the output shape. + :return: output tensor. + """ + assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type" + if isinstance(oshp, int): + oshp = (oshp, oshp) + + op = builtin.AdaptivePooling(mode="MAX", format="NCHW",) + oshp = astensor1d(oshp, inp, dtype="int32", device=inp.device) + (output,) = apply(op, inp, oshp) + return output + + +def adaptive_avg_pool2d( + inp: Tensor, oshp: Union[Tuple[int, int], int, Tensor], +) -> Tensor: + """Applies a 2D average adaptive pooling over an input. + + Refer to :class:`~.AvgAdaptivePool2d` for more information. + + :param inp: The input tensor. + :param oshp: (OH, OW) size of the output shape. + :return: output tensor. + """ + assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type" + if isinstance(oshp, int): + oshp = (oshp, oshp) + + op = builtin.AdaptivePooling(mode="AVERAGE", format="NCHW",) + oshp = astensor1d(oshp, inp, dtype="int32", device=inp.device) + (output,) = apply(op, inp, oshp) + return output + + def prelu(inp: Tensor, weight: Tensor) -> Tensor: r""" Applies the element-wise PReLU function. @@ -346,17 +393,17 @@ def softplus(inp: Tensor) -> Tensor: .. math:: \text{softplus}(x) = \log(1 + \exp(x)) - + softplus is a smooth approximation to the ReLU function and can be used - to constrain the output of a machine to always be positive. + to constrain the output to be always positive. For numerical stability the implementation follows this transformation: .. math:: - \text{softplus}(x) = \log(1 + \exp(x)) - = \log(1 + \exp(-\text{abs}(x))) + \max(x, 0) + \text{softplus}(x) = \log(1 + \exp(x)) + = \log(1 + \exp(-\text{abs}(x))) + \max(x, 0) = \log1p(\exp(-\text{abs}(x))) + \text{relu}(x) - :param inp: The input tensor + :param inp: input tensor. Examples: @@ -369,9 +416,9 @@ def softplus(inp: Tensor) -> Tensor: x = tensor(np.arange(-3, 3, dtype=np.float32)) y = F.softplus(x) print(y.numpy()) - + Outputs: - + .. testoutput:: [0.0486 0.1269 0.3133 0.6931 1.3133 2.1269] @@ -380,7 +427,7 @@ def softplus(inp: Tensor) -> Tensor: return log1p(exp(-abs(inp))) + relu(inp) -def log_softmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: +def logsoftmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional input Tensor. 
The LogSoftmax formulation can be simplified as: @@ -390,13 +437,13 @@ def log_softmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: For numerical stability the implementation follows this transformation: .. math:: - \operatorname{logsoftmax}(x) + \operatorname{logsoftmax}(x) = \log (\frac{\exp (x)}{\sum_{i}(\exp (x_{i}))}) = x - \log (\sum_{i}(\exp (x_{i}))) = x - logsumexp(x) - - :param inp: The input tensor - :param axis: An axis along which log_softmax will be applied. + + :param inp: input tensor. + :param axis: axis along which logsoftmax will be applied. Examples: @@ -407,11 +454,11 @@ def log_softmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: import megengine.functional as F x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5) - y = F.log_softmax(x, axis=1) + y = F.logsoftmax(x, axis=1) print(y.numpy()) Outputs: - + .. testoutput:: [[-4.4519 -3.4519 -2.4519 -1.4519 -0.4519] @@ -430,7 +477,7 @@ def logsigmoid(inp: Tensor) -> Tensor: = - \log(1 + exp(-x)) = - \text{softplus}(-x) - :param inp: The input tensor + :param inp: input tensor. Examples: @@ -459,11 +506,10 @@ def logsumexp( inp: Tensor, axis: Union[int, Sequence[int]], keepdims: bool = False ) -> Tensor: r""" - Compute the log of the sum of exponentials of inputs along the given :attr:`axis`. - The computation is numerically stabilized. - + Calculates the logarithm of the inputs' exponential sum along the given :attr:`axis`. + .. math:: - + \operatorname{logsumexp}(\boldsymbol{x})= \log \sum_{j=1}^{n} \exp \left(x_{j}\right) For numerical stability, the implementation follows this transformation: @@ -472,18 +518,18 @@ def logsumexp( \operatorname{logsumexp}(\boldsymbol{x})= \log \sum_{j=1}^{n} \exp \left(x_{j}\right) = \operatorname{logsumexp}(\boldsymbol{x})=b+\log \sum_{j=1}^{n} \exp \left(x_{j}-b\right) - + where .. math:: b = \max(x_j) - :param inp: The input tensor. - :param axis: Axis over which the sum is taken. It can be a single axis or a list of axes. + :param inp: input tensor. + :param axis: axis over which the sum is taken. It could be single axis or list of axes. :param keepdims: whether to retain :attr:`axis` or not for the output tensor. Examples: - + .. testcode:: import numpy as np @@ -501,11 +547,11 @@ def logsumexp( [-0.5481 4.4519] """ - max_value = max(inp, axis, keepdims=True) + max_value = max(inp.detach(), axis, keepdims=True) if keepdims: return max_value + log(sum(exp(inp - max_value), axis, keepdims)) else: - return remove_axis(max_value, axis=None) + log( + return squeeze(max_value, axis=None) + log( sum(exp(inp - max_value), axis, keepdims) ) @@ -523,13 +569,13 @@ def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor: .. math:: \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} - It is applied to all elements along axis, and will re-scale them so that - the elements lie in the range `[0, 1]` and sum to 1. + It is applied to all elements along axis, and rescales elements so that + they stay in the range `[0, 1]` and sum to 1. See :class:`~megengine.module.activation.Softmax` for more details. - :param inp: The input tensor. - :param axis: An axis along which softmax will be applied. By default, + :param inp: input tensor. + :param axis: an axis along which softmax will be applied. By default, softmax will apply along the highest ranked axis. 
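The transformation above is directly checkable: ``logsoftmax(x)`` should equal ``x - logsumexp(x, axis, keepdims=True)``. A hedged sketch (assumes this branch):

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    x = tensor(np.random.randn(2, 5).astype("float32"))
    lhs = F.logsoftmax(x, axis=1)
    rhs = x - F.logsumexp(x, axis=1, keepdims=True)
    np.testing.assert_allclose(lhs.numpy(), rhs.numpy(), rtol=1e-4)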
Examples: @@ -560,7 +606,7 @@ def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor: return cached / down -def batch_norm2d( +def batch_norm( inp: Tensor, running_mean: Tensor = None, running_var: Tensor = None, @@ -572,7 +618,7 @@ def batch_norm2d( eps: float = 1e-5, inplace: bool = True ): - """Applies batch normalization to the input. + r"""Applies batch normalization to the input. Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information. @@ -584,26 +630,28 @@ def batch_norm2d( :param bias: bias tensor in the learnable affine parameters. See :math:`\beta` in :class:`~.BatchNorm2d`. :param training: a boolean value to indicate whether batch norm is performed - in traning mode. Default: False + in training mode. Default: False :param momentum: value used for the ``running_mean`` and ``running_var`` computation. Default: 0.9 :param eps: a value added to the denominator for numerical stability. Default: 1e-5 - :param inplace: whether to update running_mean and running_var inplace or return new tensors + :param inplace: whether to update ``running_mean`` and ``running_var`` inplace or return new tensors Default: True :return: output tensor. """ + if inp.ndim != 4: + raise NotImplementedError("batch_norm for ndim != 4") def full_value(value): C = inp.shape[1] (x,) = Const(value, dtype=inp.dtype, device=inp.device)(inp) - return broadcast(x, [1, C, 1, 1]) + return broadcast_to(x, [1, C, 1, 1]) def expand_or_full(x, value): if x is None: return full_value(value) - return add_axis(x, [0, 2, 3]) + return expand_dims(x, [0, 2, 3]) def make_full_if_none(x, value): if x is None: @@ -676,7 +724,7 @@ def sync_batch_norm( eps_mode="ADDITIVE", group=WORLD, ) -> Tensor: - """Applies synchronized batch normalization to the input. + r"""Applies synchronized batch normalization to the input. Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information. @@ -717,7 +765,7 @@ def sync_batch_norm( if is_distributed(): # reduce all nodes' data to calculate mean and variance - reduce_size = broadcast(Tensor(reduce_size, dtype=_dtype), [1] * _ndim) + reduce_size = broadcast_to(Tensor(reduce_size, dtype=_dtype), [1] * _ndim) stat = concat( [reduce_size.astype(_dtype), channel_x1s, channel_x2s], axis=1 ) @@ -838,6 +886,10 @@ def warp_perspective( :param interp_mode: interpolation methods. Default: "LINEAR" :return: output tensor. + Note: + + The transformation matrix is the inverse of that used by `cv2.warpPerspective`. + Examples: .. testcode:: @@ -868,7 +920,8 @@ def warp_perspective( imode=interp_mode, bmode=border_mode, format="NCHW", border_val=border_val ) inp, M = utils.convert_inputs(inp, M) - (result,) = apply(op, inp, M, Tensor(dsize)) + dsize = astensor1d(dsize, inp, dtype="int32", device=inp.device) + (result,) = apply(op, inp, M, dsize) return result @@ -885,19 +938,18 @@ def matmul( With different inputs dim, this function behaves differently: - - Both 1-D tensor, simply forward to dot. + - Both 1-D tensor, simply forward to ``dot``. - Both 2-D tensor, normal matrix multiplication. - If one input tensor is 1-D, matrix vector multiplication. - - If at least one tensor are 3-dimensional or >3-dimensional, the batched matrix-matrix is returned, and the tensor with smaller dimension will + - If at least one tensor are 3-dimensional or >3-dimensional, the other tensor should have dim >= 2, the batched matrix-matrix is returned, and the tensor with smaller dimension will be broadcasted. 
For example: - - inp1: `(k, m)`, inp2: `(m, p)`, return: `(k, p)` - inp1: `(n, k, m)`, inp2: `(n, m, p)`, return: `(n, k, p)` - inp1: `(n, k, m)`, inp2: `(m, p)`, return: `(n, k, p)` - inp1: `(n, j, k, m)`, inp2: `(n, j, m, p)`, return: `(n, j, k, p)` - :param inp1: The first matrix to be multiplied - :param inp2: The second matrix to be multiplied - :return: The output tensor + :param inp1: first matrix to be multiplied. + :param inp2: second matrix to be multiplied. + :return: output tensor. Examples: @@ -931,10 +983,10 @@ def matmul( if dim1 != dim2: if dim1 < dim2: shape1 = shape2[: dim2 - dim1] + shape1 - inp1 = inp1.broadcast(*shape1) + inp1 = broadcast_to(inp1, shape1) else: shape2 = shape1[: dim1 - dim2] + shape2 - inp2 = inp2.broadcast(*shape2) + inp2 = broadcast_to(inp2, shape2) reshaped_batch_size = 1 for i in shape1[:-2]: reshaped_batch_size *= i @@ -949,9 +1001,9 @@ def matmul( shp = shape1[:-1] + shape2[-1:] elif dim1 == 3 or dim2 == 3: if dim2 < 3: - inp2 = inp2.broadcast(*(inp1.shape[:1] + inp2.shape)) + inp2 = broadcast_to(inp2, inp1.shape[:1] + inp2.shape) elif dim1 < 3: - inp1 = inp1.broadcast(*(inp2.shape[:1] + inp1.shape)) + inp1 = broadcast_to(inp1, inp2.shape[:1] + inp1.shape) op = builtin.BatchedMatrixMul( transposeA=transpose_a, transposeB=transpose_b, @@ -961,10 +1013,10 @@ def matmul( else: if dim1 == 1: shp = (inp2.shape[1],) - inp1 = add_axis(inp1, 0) + inp1 = expand_dims(inp1, 0) if dim2 == 1: shp = (inp1.shape[0],) - inp2 = add_axis(inp2, 1) + inp2 = expand_dims(inp2, 1) op = builtin.MatrixMul( transposeA=transpose_a, transposeB=transpose_b, @@ -981,12 +1033,12 @@ def matmul( def dot(inp1: Tensor, inp2: Tensor) -> Tensor: """ - Compute dot-product of two vectors ``inp1`` and ``inp2``. + Computes dot-product of two vectors ``inp1`` and ``inp2``. inputs must be 1-dimensional, scalar input can be automatically broadcasted. - :param inp1: The first vector - :param inp2: The second vector - :return: The output value + :param inp1: first vector. + :param inp2: second vector. + :return: output value. Examples: @@ -1016,10 +1068,10 @@ def dot(inp1: Tensor, inp2: Tensor) -> Tensor: def svd(inp: Tensor, full_matrices=False, compute_uv=True) -> Tensor: """ - Compute the singular value decompositions of input matrix ``inp``. + Computes the singular value decompositions of input matrix. - :param inp: The input matrix, must has shape ``[..., M, N]`` - :return: The output matrices, U, sigma, V + :param inp: input matrix, must has shape `[..., M, N]`. + :return: output matrices, `(U, sigma, V)`. Examples: @@ -1036,7 +1088,7 @@ def svd(inp: Tensor, full_matrices=False, compute_uv=True) -> Tensor: Outputs: .. testoutput:: - + [7.3485 1. ] """ @@ -1052,8 +1104,7 @@ def interpolate( mode: str = "BILINEAR", align_corners: bool = None, ) -> Tensor: - r"""Down/up samples the input tensor to either the given size or the given - scale_factor. + r"""Down/up samples the input tensor to either the given size or with the given scale_factor. ``size`` can not coexist with ``scale_factor``. :param inp: input tensor. :param size: size of the output tensor. Default: None @@ -1069,13 +1120,12 @@ def interpolate( import numpy as np from megengine import tensor import megengine.functional as F - from megengine.test import assertTensorClose x = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) - out = F.interpolate(x, [4, 4], align_corners=False) + out = F.nn.interpolate(x, [4, 4], align_corners=False) print(out.numpy()) - out2 = F.interpolate(x, scale_factor=2.) 
- assertTensorClose(out.numpy(), out2.numpy()) + out2 = F.nn.interpolate(x, scale_factor=2.) + np.testing.assert_allclose(out.numpy(), out2.numpy()) Outputs: @@ -1100,7 +1150,7 @@ def interpolate( align_corners = False if mode == "LINEAR": - inp = add_axis(inp, 3) + inp = expand_dims(inp, 3) if inp.ndim != 4: raise ValueError("shape of input tensor must correspond to the operartion mode") @@ -1170,7 +1220,7 @@ def interpolate( [row0, row1, Tensor([[0, 0, 1]], dtype="float32", device=inp.device)], axis=0, ).reshape(1, 3, 3) - weight = broadcast(weight, (inp.shape[0], 3, 3)) + weight = broadcast_to(weight, (inp.shape[0], 3, 3)) else: hscale = 1.0 * ih / oh wscale = 1.0 * iw / ow @@ -1186,7 +1236,7 @@ def interpolate( [row0, row1, Tensor([[0, 0, 1]], dtype="float32", device=inp.device)], axis=0, ).reshape(1, 3, 3) - weight = broadcast(weight, (inp.shape[0], 3, 3)) + weight = broadcast_to(weight, (inp.shape[0], 3, 3)) weight = weight.astype("float32") ret = warp_perspective(inp, weight, dsize, interp_mode="LINEAR") @@ -1197,12 +1247,12 @@ def interpolate( def dropout(inp: Tensor, drop_prob: float, training: bool = True) -> Tensor: """Returns a new tensor where each of the elements are randomly set to zero - with probability P = ``drop_prob``. Optionally rescale the output tensor. + with probability P = ``drop_prob``. Optionally rescale the output tensor if ``training`` is True. :param inp: input tensor. :param drop_prob: probability to drop (set to zero) a single element. :param training: the default behavior of ``dropout`` during training is to rescale the output, - then it can be replaced by an :class:`~.Identity` during inference, default to True. + then it can be replaced by an :class:`~.Identity` during inference. Default: True :return: the output tensor Examples: @@ -1244,10 +1294,10 @@ def embedding( """Applies lookup table for embedding. :param inp: tensor with indices. - :param weight: learnable weights which embedding from. - :param padding_idx: should be set to None, not support now. - :param max_norm: should be set to None, not support now. - :param norm_type: should be set to None, not support now. + :param weight: learnable weights which embeds from. + :param padding_idx: should be set to None, not supported now. + :param max_norm: should be set to None, not supported now. + :param norm_type: should be set to None, not supported now. :return: output tensor. Refer to :class:`~.Embedding` for more information. @@ -1288,7 +1338,7 @@ def roi_pooling( np.random.seed(42) inp = tensor(np.random.randn(1, 1, 128, 128)) rois = tensor(np.random.random((4, 5))) - y = F.roi_pooling(inp, rois, (2, 2)) + y = F.nn.roi_pooling(inp, rois, (2, 2)) print(y.numpy()[0]) Outputs: @@ -1323,14 +1373,14 @@ def roi_align( ) -> Tensor: """Applies roi align on input feature. - :param inp: tensor that represents the input feature, `(N, C, H, W)` images. - :param rois: `(N, 5)` boxes. First column is the index into N. The other 4 columns are xyxy. + :param inp: tensor that represents the input feature, shape is `(N, C, H, W)`. + :param rois: `(N, 5)` boxes. First column is the box index. The other 4 columns are ``xyxy``. :param output_shape: `(height, width)` shape of output rois feature. :param mode: "max" or "average", use max/average align just like max/average pooling. Default: "average" :param spatial_scale: scale the input boxes by this number. Default: 1.0 :param sample_points: number of inputs samples to take for each output sample. 0 to take samples densely. 
        Default: 2
-    :param aligned: wheather align the input feature, with `aligned=True`,
+    :param aligned: whether to align the input feature; with `aligned=True`,
         we first appropriately scale the ROI and then shift it by -0.5.
         Default: True
     :return: output tensor.
 
@@ -1345,7 +1395,7 @@ def roi_align(
         np.random.seed(42)
         inp = tensor(np.random.randn(1, 1, 128, 128))
         rois = tensor(np.random.random((4, 5)))
-        y = F.roi_align(inp, rois, (2, 2))
+        y = F.nn.roi_align(inp, rois, (2, 2))
         print(y.numpy()[0])
 
     Outputs:
 
@@ -1383,7 +1433,7 @@ def roi_align(
 def indexing_one_hot(
     src: Tensor, index: Tensor, axis: int = 1, keepdims=False
 ) -> Tensor:
-    r"""One-hot indexing for some axis.
+    r"""One-hot indexing for some axes.
 
     :param src: input tensor.
     :param index: index tensor.
@@ -1417,19 +1467,23 @@ def indexing_one_hot(
     index = utils.convert_single_value(index, (src,), dtype="int32", device=src.device)
     (result,) = apply(op, src, index)
     if not keepdims:
-        result = remove_axis(result, axis)
+        result = squeeze(result, axis)
     return result
 
 
-def nms(boxes: Tensor, scores: Tensor, iou_thresh: float) -> Tensor:
+def nms(
+    boxes: Tensor, scores: Tensor, iou_thresh: float, max_output: Optional[int] = None
+) -> Tensor:
     r"""
     Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union(IoU).
 
     :param boxes: tensor of shape `(N, 4)`; the boxes to perform nms on; each box is expected to be in `(x1, y1, x2, y2)` format.
-    :param iou_thresh: iou threshold for overlapping.
+    :param iou_thresh: IoU threshold for overlapping.
     :param scores: tensor of shape `(N,)`, the score of boxes.
+    :param max_output: the maximum number of boxes to keep; it is optional if this operator is not traced,
+        otherwise it is required to be specified; if it is not specified, all boxes are kept.
     :return: indices of the elements that have been kept by NMS.
-
+
     Examples:
 
     ..
testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - x = np.zeros((100,4)) - np.random.seed(42) - x[:,:2] = np.random.rand(100,2)*20 - x[:,2:] = np.random.rand(100,2)*20 + 100 - scores = tensor(np.random.rand(100)) - idxs = tensor(np.random.randint(0, 10, 100)) - inp = tensor(x) - result = F.batched_nms(inp, scores, idxs, iou_thresh=0.6) - print(result.numpy()) - - Outputs: - - .. testoutput:: - - [75 41 99 98 69 64 11 27 35 18] - - """ - assert ( - boxes.ndim == 2 and boxes.shape[1] == 4 - ), "the expected shape of boxes is (N, 4)" - assert scores.ndim == 1, "the expected shape of scores is (N,)" - assert idxs.ndim == 1, "the expected shape of idxs is (N,)" - assert boxes.shape[0] == scores.shape[0] == idxs.shape[0] - - boxes = boxes.detach() - scores = scores.detach() - idxs = idxs.detach() - max_coordinate = boxes.max() - offsets = idxs.astype("float32") * (max_coordinate + 1) - boxes = boxes + offsets.reshape(-1, 1).broadcast(boxes.shape[0], 4) - - sorted_idx = argsort(scores, descending=True) - boxes = boxes[sorted_idx] - max_output = boxes.shape[0] - - op = builtin.NMSKeep(iou_thresh, max_output) - inp = utils.convert_inputs(boxes.reshape(1, -1, 4)) - indices, count = apply(op, *inp) - indices = indices[0][: count.item()] - keep_inds = sorted_idx[indices] - return keep_inds +from .loss import * # isort:skip +from .quantized import conv_bias_activation # isort:skip diff --git a/imperative/python/megengine/functional/param_pack.py b/imperative/python/megengine/functional/param_pack.py deleted file mode 100644 index d7d52085..00000000 --- a/imperative/python/megengine/functional/param_pack.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - -from ..tensor import Tensor -from .distributed import all_reduce_sum -from .tensor import param_pack_concat, param_pack_split - - -def get_offsets(shapes): - offsets = [] - offset = 0 - for shape in shapes: - offsets.append(offset) - offset += int(np.prod(shape)) - offsets.append(offset) - return offsets - - -def pack_allreduce_split(pack_list, shapes, group, reduce_method): - offsets_val = get_offsets(shapes) - offsets = Tensor(offsets_val) - packed_grads = param_pack_concat(pack_list, offsets, offsets_val) - packed_grads = all_reduce_sum(packed_grads, group) - if reduce_method == "mean": - packed_grads /= group.size - grads = param_pack_split(packed_grads, offsets_val, shapes) - return grads diff --git a/imperative/python/megengine/functional/quantized.py b/imperative/python/megengine/functional/quantized.py index 07a3b61a..0f92b1a0 100644 --- a/imperative/python/megengine/functional/quantized.py +++ b/imperative/python/megengine/functional/quantized.py @@ -34,26 +34,23 @@ def conv_bias_activation( :param weight: convolution kernel. :param bias: bias added to the result of convolution :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 + :param padding: size of the paddings added to the input on both sides of its spatial dimensions. 
Only zero-padding is supported. Default: 0 :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When groups is not 1, - in_channels and out_channels must be divisible by groups, + :param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, and the shape of weight should be `(groups, out_channel // groups, in_channels // groups, height, width)`. :type conv_mode: string or :class:`P.Convolution.Mode`. :param conv_mode: supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default: 'CROSS_CORRELATION' - :param dtype: support for np.dtype, Default: np.int8 + :param dtype: support for ``np.dtype``, Default: np.int8 :param scale: scale if use quantization, Default: 0.0 :param zero_point: scale if use quantization quint8, Default: 0.0 :type compute_mode: string or :class:`P.Convolution.ComputeMode`. - :param compute_mode: when set to 'DEFAULT', no special requirements will be - placed on the precision of intermediate results. When set to 'FLOAT32', - Float32 would be used for accumulator and intermediate result, but only - effective when input and output are of Float16 dtype. + :param compute_mode: when set to "DEFAULT", no special requirements will be + placed on the precision of intermediate results. When set to "FLOAT32", + "Float32" would be used for accumulator and intermediate result, but only effective when input and output are of Float16 dtype. """ ph, pw = _pair(padding) diff --git a/imperative/python/megengine/functional/tensor.py b/imperative/python/megengine/functional/tensor.py index 31053b78..99f57438 100644 --- a/imperative/python/megengine/functional/tensor.py +++ b/imperative/python/megengine/functional/tensor.py @@ -19,6 +19,7 @@ from ..core.ops import builtin from ..core.ops._internal import param_defs as P from ..core.ops.special import Const from ..core.tensor.core import TensorBase, TensorWrapperBase, apply +from ..core.tensor.tensor_wrapper import _broadcast, _remove_axis from ..core.tensor.utils import ( astensor1d, convert_inputs, @@ -31,27 +32,22 @@ from ..tensor import Tensor from .elemwise import ceil __all__ = [ - "add_axis", "arange", - "broadcast", + "broadcast_to", "concat", "cond_take", - "transpose", - "add_axis", + "expand_dims", "eye", "flatten", "full", "full_like", "gather", - "identity", "linspace", "ones", "ones_like", - "param_pack_concat", - "param_pack_split", "reshape", - "remove_axis", "split", + "squeeze", "stack", "scatter", "transpose", @@ -61,11 +57,10 @@ __all__ = [ ] -def eye(shape, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor: +def eye(N, M=None, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor: """Returns a 2D tensor with ones on the diagonal and zeros elsewhere. - :param shape: expected shape of otuput tensor. - :param m: number of columns. Default: None + :param shape: expected shape of output tensor. :param dtype: data type. Default: None :param device: compute node of the matrix. Default: None :return: eye matrix. 
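The renames in this file follow NumPy spelling: ``broadcast`` -> ``broadcast_to``, ``add_axis`` -> ``expand_dims``, ``remove_axis`` -> ``squeeze``. A small migration sketch (hypothetical session, assuming this branch):

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    x = tensor(np.arange(6, dtype=np.float32).reshape(2, 3))
    y = F.broadcast_to(x, (4, 2, 3))  # was F.broadcast
    z = F.expand_dims(x, 0)           # was F.add_axis
    w = F.squeeze(z, 0)               # was F.remove_axis
    print(y.shape, z.shape, w.shape)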
@@ -77,8 +72,7 @@ def eye(shape, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor: import numpy as np import megengine.functional as F - data_shape = (4, 6) - out = F.eye(data_shape, dtype=np.float32) + out = F.eye(4, 6, dtype=np.float32) print(out.numpy()) Outputs: @@ -91,8 +85,17 @@ def eye(shape, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor: [0. 0. 0. 1. 0. 0.]] """ + if M is not None: + if isinstance(N, Tensor) or isinstance(M, Tensor): + shape = astensor1d((N, M)) + else: + shape = Tensor([N, M], dtype="int32", device=device) + elif isinstance(N, Tensor): + shape = N + else: + shape = Tensor(N, dtype="int32", device=device) op = builtin.Eye(k=0, dtype=dtype, comp_node=device) - (result,) = apply(op, Tensor(shape, dtype="int32", device=device)) + (result,) = apply(op, shape) return result @@ -106,7 +109,7 @@ def full(shape, value, dtype="float32", device=None): (x,) = Const(value, dtype=dtype, device=device)( Tensor(value, dtype=dtype, device=device) ) - return broadcast(x, shape) + return broadcast_to(x, shape) def ones(shape, dtype="float32", device=None): @@ -160,7 +163,7 @@ def zeros_like(inp: Tensor) -> Tensor: print(out.numpy()) Outputs: - + .. testoutput:: [[0 0 0] @@ -171,7 +174,7 @@ def zeros_like(inp: Tensor) -> Tensor: def ones_like(inp: Tensor) -> Tensor: - """Returns a identity tensor with the same shape as input tensor. + """Returns a ones tensor with the same shape as input tensor. """ return ones(inp.shape, dtype=inp.dtype, device=inp.device) @@ -182,19 +185,7 @@ def full_like(inp: Tensor, value: Union[int, float]) -> Tensor: return full(inp.shape, value, dtype=inp.dtype, device=inp.device) -def identity(inp: Tensor) -> Tensor: - """Applies an identity transform to the input tensor. - - :param inp: input tensor. - :return: output tensor. - """ - op = builtin.Identity() - (data,) = convert_inputs(inp) - (output,) = apply(op, data) - return output - - -def broadcast(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: +def broadcast_to(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: """ Broadcasts a tensor to given shape. @@ -211,7 +202,7 @@ def broadcast(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: import megengine.functional as F data = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - out = F.broadcast(data, (4, 2, 3)) + out = F.broadcast_to(data, (4, 2, 3)) print(out.numpy()) Outputs: @@ -231,9 +222,7 @@ def broadcast(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: [3. 4. 5.]]] """ - shape = astensor1d(shape, inp, dtype="int32", device=inp.device) - (result,) = apply(builtin.Broadcast(), inp, shape) - return result + return _broadcast(inp, shape) def concat(inps: Iterable[Tensor], axis: int = 0, device=None) -> Tensor: @@ -241,8 +230,8 @@ def concat(inps: Iterable[Tensor], axis: int = 0, device=None) -> Tensor: Concat some tensors :param inps: input tensors to concat. - :param axis: dimension over which the tensors are concatenated. Default: 0 - :param device: comp node output on. Default: None + :param axis: the dimension along which the tensors are concatenated. Default: 0 + :param device: the device on which the output will reside. Default: None :return: output tensor. Examples: @@ -290,7 +279,7 @@ def stack(inps, axis=0, device=None): :param inps: input tensors. :param axis: which axis will be concatenated. - :param device: The comp node output on. Default: None + :param device: the device on which the output will reside. Default: None :return: output concatenated tensor. 
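The stack implementation in the next hunk reduces to ``expand_dims`` followed by ``concat``; a minimal sketch of that equivalence, assuming two same-shaped inputs and the patched module:

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    x = tensor(np.zeros((2, 3), dtype="float32"))
    y = tensor(np.ones((2, 3), dtype="float32"))

    # each input gains a new leading axis, then the results are concatenated
    out = F.concat([F.expand_dims(x, 0), F.expand_dims(y, 0)], axis=0)
    print(out.shape)  # (2, 2, 3)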
Examples: @@ -322,7 +311,7 @@ def stack(inps, axis=0, device=None): if len(shapes) != 1: raise ValueError("All input tensors must have the same shape") - inps = [add_axis(inp, axis=axis) for inp in inps] + inps = [expand_dims(inp, axis=axis) for inp in inps] return concat(inps, axis=axis, device=device) @@ -331,7 +320,7 @@ def split(inp, nsplits_or_sections, axis=0): When nsplits_or_sections is int, the last tensor may be smaller than others. :param inp: input tensor. - :param nsplits_or_sections: number of sub tensors or section information list. + :param nsplits_or_sections: number of sub tensors or a list of section sizes. :param axis: which axis will be splited. :return: output tensor list. @@ -399,8 +388,7 @@ def _get_idx(index, axis): 0, index.shape[i] - 1, index.shape[i], device=index.device, ) arange = ( - arange.reshape(*shape) - .broadcast(index.shape) + broadcast_to(arange.reshape(*shape), index.shape) .reshape(-1) .astype(np.int32) ) @@ -411,7 +399,8 @@ def _get_idx(index, axis): def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor: - r"""Gathers data from inp on axis using index. + # TODO: rewrite doc + r"""Gathers data from input tensor on axis using index. For a 3-D tensor, the output is specified by:: out[i][j][k] = inp[index[i][j][k]][j][k] # if axis == 0 out[i][j][k] = inp[i][index[i][j][k]][k] # if axis == 1 out[i][j][k] = inp[i][j][index[i][j][k]] # if axis == 2 - if inp is an n-dimensional tensor with size + if input tensor is an n-dimensional tensor with size :math:`(x_0,x_1,...,x_{i-1},x_i,x_{i+1},...,x_{n-1})` and axis=i, - then index must be an n-dimensional tensor with size + then index must be an n-dimensional tensor with size :math:`(x_0,x_1,...,x_{i-1},y,x_{i+1},...,x_{n-1})` where :math:`y\ge 1` and output will have the same size as index. :param inp: input tensor. - :param axis: axis along which to index. + :param axis: along which axis to index. :param index: indices of elements to gather. :return: output tensor. @@ -482,20 +471,21 @@ def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor: def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: - r"""Writes all values from the tensor source into inp + # TODO: rewrite doc + r"""Writes all values from the tensor source into input tensor at the indices specified in the index tensor. For each value in source, its output index is specified by its index in source for ``axis != dimension`` and by the corresponding value in index for ``axis = dimension``. - For a 3-D tensor, inp is updated as:: + For a 3-D tensor, input tensor is updated as:: inp[index[i][j][k]][j][k] = source[i][j][k] # if axis == 0 inp[i][index[i][j][k]][k] = source[i][j][k] # if axis == 1 inp[i][j][index[i][j][k]] = source[i][j][k] # if axis == 2 - inp, index and source should have same number of dimensions. + ``inp``, ``index`` and ``source`` should have same number of dimensions. It is also required that ``source.shape(d) <= inp.shape(d)`` and ``index.shape(d) == source.shape(d)`` for all dimensions ``d``. @@ -504,10 +494,10 @@ def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: .. note:: Please notice that, due to performance issues, the result is uncertain on the GPU device - if scatter difference positions from source to the same destination position + if scattering different positions from source to the same destination position regard to index tensor. 
- Show the case using the following examples, the oup[0][2] is maybe + Check the following examples: the oup[0][2] may come from source[0][2] which value is 0.2256 or source[1][2] which value is 0.5339 if set the index[1][2] from 1 to 0. @@ -593,7 +583,7 @@ def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor: \textrm{out}_i = x_i \textrm{ if } \textrm{mask}_i \textrm{ is True else } y_i - :param mask: a mask used for choosing x or y. + :param mask: a mask used for choosing ``x`` or ``y``. :param x: first choice. :param y: second choice. :return: output tensor. @@ -649,7 +639,7 @@ def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor: def cond_take(mask: Tensor, x: Tensor) -> Tensor: r""" - Take elements from data if specific condition is satisfied on mask. + Takes elements from data if specific condition is satisfied on mask. This operator has two outputs: the first is the elements taken, and the second is the indices corresponding to those elements; they are both 1-dimensional. High-dimension input would first be flattened. @@ -696,7 +686,7 @@ def transpose(inp: Tensor, pattern: Iterable[int]) -> Tensor: Swaps shapes and strides according to given pattern. :param inp: input tensor. - :param pattern: a list of integers including 0, 1, ... , ``ndim``-1, + :param pattern: a list of integers including 0, 1, ... , ``ndim``-1, and any number of ``'x'`` char in dimensions where this tensor should be broadcasted. For examples: * (``'x'``) -> make a 0d (scalar) into a 1d vector @@ -707,7 +697,7 @@ def transpose(inp: Tensor, pattern: Iterable[int]) -> Tensor: * (2, 0, 1) -> AxBxC to CxAxB * (0, ``'x'``, 1) -> AxB to Ax1xB * (1, ``'x'``, 0) -> AxB to Bx1xA - * (1,) -> This remove dimensions 0. It must be a broadcastable dimension (1xA to A) + * (1,) -> this removes dimension 0. It must be a broadcastable dimension (1xA to A) :return: output tensor. @@ -730,13 +720,7 @@ def transpose(inp: Tensor, pattern: Iterable[int]) -> Tensor: [1 0]] """ - op = builtin.Dimshuffle(pattern) - (inp,) = convert_inputs(inp) - (result,) = apply(op, inp) - return result - - -dimshuffle = transpose + return inp.transpose(pattern) def reshape(inp: Tensor, target_shape: Iterable[int]) -> Tensor: @@ -745,8 +729,7 @@ def reshape(inp: Tensor, target_shape: Iterable[int]) -> Tensor: remain unchanged :param inp: input tensor. - :param target_shape: target shape, the components would be concatenated to form the - target shape, and it can contain an element of -1 representing unspec_axis. + :param target_shape: target shape; it can contain an element of -1 representing ``unspec_axis``. 
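A small sketch of the -1 wildcard described above, assuming the patched module; the unspecified axis is inferred from the element count:

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    x = tensor(np.arange(12, dtype="float32"))
    y = F.reshape(x, (3, -1))  # the -1 axis resolves to 4 here
    print(y.shape)  # (3, 4)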
Examples: @@ -773,26 +756,7 @@ [10 11]]] """ - if isinstance(target_shape, (TensorBase, TensorWrapperBase)): - target_shape = target_shape.numpy() - target_shape = tuple(map(int, target_shape)) - unspec_axis = None - for i, s in enumerate(target_shape): - if s < 0: - if s != -1: - raise ValueError("expect shape[{}] >= -1, got {}".format(i, s)) - if unspec_axis is not None: - raise ValueError("multiple -1 in shape: {} & {}".format(unspec_axis, i)) - unspec_axis = i - - # TODO: device should be None (cpu) - (target_shape,) = Const(target_shape, dtype="int32", device=inp.device)(inp) - if unspec_axis is None: - op = builtin.Reshape() - else: - op = builtin.Reshape(unspec_axis=unspec_axis) - (x,) = apply(op, inp, target_shape) - return x + return inp.reshape(target_shape) AxisAddRemove = builtin.AxisAddRemove @@ -837,7 +801,7 @@ def flatten(inp: Tensor, start_axis: int = 0, end_axis: int = -1) -> Tensor: return inp.reshape(*target_shape) -def add_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: +def expand_dims(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: r""" Adds dimension before given axis. @@ -854,7 +818,7 @@ import megengine.functional as F x = tensor([1, 2]) - out = F.add_axis(x, 0) + out = F.expand_dims(x, 0) print(out.shape) Outputs: @@ -883,12 +847,7 @@ return result -add_axis = add_axis - - -def remove_axis( - inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None -) -> Tensor: +def squeeze(inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None) -> Tensor: r""" Removes dimension of shape 1. @@ -905,7 +864,7 @@ import megengine.functional as F x = tensor(np.array([1, 2], dtype=np.int32).reshape(1, 1, 2, 1)) - out = F.remove_axis(x, 3) + out = F.squeeze(x, 3) print(out.shape) Outputs: @@ -915,25 +874,7 @@ (1, 1, 2) """ - Param = builtin.AxisAddRemove.Param - - def get_axes(): - if axis is None: - return [i for i, s in enumerate(inp.shape) if s == 1] - try: - return [int(axis)] - except (TypeError, ValueError): - pass - return list(map(int, axis)) - - axis = get_axes() - axis = sorted(i + inp.ndim if i < 0 else i for i in axis) - axis = [a - i for i, a in enumerate(axis)] - - param = Param(*map(builtin.AxisAddRemove.AxisDesc.make_remove, axis)) - op = builtin.AxisAddRemove(param=param) - (result,) = apply(op, inp) - return result + return _remove_axis(inp, axis) def linspace( @@ -962,7 +903,7 @@ print(a.numpy()) Outputs: - + .. testoutput:: [ 3. 4.75 6.5 8.25 10. ] @@ -982,15 +923,15 @@ def arange( start: Union[int, float, Tensor] = 0, - end: Optional[Union[int, float, Tensor]] = None, + stop: Optional[Union[int, float, Tensor]] = None, step: Union[int, float, Tensor] = 1, dtype="float32", device: Optional[CompNode] = None, ) -> Tensor: - r"""Returns a Tensor with values from start to end with adjacent interval step. + r"""Returns a tensor with evenly spaced values from start to stop, with step as the interval. :param start: starting value of the squence, shoule be scalar. - :param end: ending value of the squence, shoule be scalar. + :param stop: ending value of the sequence, should be scalar. :param step: gap between each pair of adjacent values. Default: 1 :param dtype: result data type. :return: generated tensor. 
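The renamed pair above are inverses on a given axis; a round-trip sketch, assuming the patched module:

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    x = tensor(np.zeros((2, 3), dtype="float32"))
    y = F.expand_dims(x, 0)  # (1, 2, 3)
    z = F.squeeze(y, 0)      # back to (2, 3)
    print(y.shape, z.shape)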
@@ -1004,7 +945,7 @@ def arange( a = F.arange(5) print(a.numpy()) - + Outputs: @@ -1014,96 +955,18 @@ [0. 1. 2. 3. 4.] """ - if end is None: - start, end = 0, start + if stop is None: + start, stop = 0, start if isinstance(start, Tensor): start = start.astype("float32") - if isinstance(end, Tensor): - end = end.astype("float32") + if isinstance(stop, Tensor): + stop = stop.astype("float32") if isinstance(step, Tensor): step = step.astype("float32") - num = ceil(Tensor((end - start) / step, device=device)) + num = ceil(Tensor((stop - start) / step, device=device)) stop = start + step * (num - 1) result = linspace(start, stop, num, device=device) if np.dtype(dtype) == np.int32: return result.astype(dtype) return result - - -def param_pack_split(inp: Tensor, offsets: List, shapes: List) -> Tensor: - r""" - Returns split Tensor to Tensor list as offsets and shapes described, - only used for parampack. - - :param inp: input tensor. - :param offsets: offsets of outputs, length of 2 * n, - while n is tensor nums you want to split, - format `[begin0, end0, begin1, end1]`. - :param shapes: tensor shapes of outputs. - :return: split tensors. - - Examples: - - .. testcode:: - - import numpy as np - import megengine.functional as F - from megengine import tensor - - a = tensor(np.ones((10,), np.int32)) - b, c = F.param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)]) - print(b.numpy()) - print(c.numpy()) - - Outputs: - - .. testoutput:: - - [1] - [[1 1 1] - [1 1 1] - [1 1 1]] - - """ - op = builtin.ParamPackSplit() - op.offsets = offsets - op.shapes = shapes - return apply(op, inp) - - -def param_pack_concat(inps: List, offsets: Tensor, offsets_val: List) -> Tensor: - r""" - Returns concat Tensor, only used for parampack. - - :param inps: input tensors. - :param offsets: device value of offsets. - :param offsets_val: offsets of inputs, length of 2 * n, - format [begin0, end0, begin1, end1]. - :return: concat tensors - - Examples: - - .. testcode:: - - import numpy as np - import megengine.functional as F - from megengine import tensor - - a = tensor(np.ones((1,), np.int32)) - b = tensor(np.ones((3, 3), np.int32)) - offsets_val = [0, 1, 1, 10] - offsets = tensor(offsets_val, np.int32) - c = F.param_pack_concat([a, b], offsets, offsets_val) - print(c.numpy()) - - Outputs: - - .. testoutput:: - - [1 1 1 1 1 1 1 1 1 1] - - """ - op = builtin.ParamPackConcat() - op.offsets = offsets_val - return apply(op, *inps, offsets)[0] diff --git a/imperative/python/megengine/functional/utils.py b/imperative/python/megengine/functional/utils.py index 7ac9713b..fa38e8b1 100644 --- a/imperative/python/megengine/functional/utils.py +++ b/imperative/python/megengine/functional/utils.py @@ -11,18 +11,24 @@ from typing import Iterable, Union import numpy as np -from ..core.ops.builtin import Copy +from ..core._wrap import device as as_device +from ..core.ops.builtin import Copy, Identity from ..core.tensor import Tensor from ..core.tensor.core import apply from .math import topk as _topk -from .tensor import transpose as _transpose +from .tensor import broadcast_to, transpose +__all__ = [ + "topk_accuracy", + "copy", +] -def accuracy( + +def topk_accuracy( logits: Tensor, target: Tensor, topk: Union[int, Iterable[int]] = 1 ) -> Union[Tensor, Iterable[Tensor]]: r""" - Calculate the classification accuracy given predicted logits and ground-truth labels. + Calculates the classification accuracy given predicted logits and ground-truth labels. 
:param logits: model predictions of shape `[batch_size, num_classes]`, representing the probability (likelyhood) of each class. @@ -40,7 +46,7 @@ def accuracy( logits = tensor(np.arange(80, dtype=np.int32).reshape(8,10)) target = tensor(np.arange(8, dtype=np.int32)) - top1, top5 = F.accuracy(logits, target, (1, 5)) + top1, top5 = F.topk_accuracy(logits, target, (1, 5)) print(top1.numpy(), top5.numpy()) Outputs: @@ -54,8 +60,8 @@ def accuracy( _, pred = _topk(logits, k=max(topk), descending=True) accs = [] for k in topk: - correct = pred[:, :k].detach() == _transpose(target, (0, "x")).broadcast( - target.shape[0], k + correct = pred[:, :k].detach() == broadcast_to( + transpose(target, (0, "x")), (target.shape[0], k) ) accs.append(correct.astype(np.float32).sum() / target.shape[0]) if len(topk) == 1: # type: ignore[arg-type] @@ -63,25 +69,12 @@ def accuracy( return accs -def zero_grad(inp: Tensor) -> Tensor: - r""" - Returns a tensor which is treated as constant during backward gradient calcuation, - i.e. its gradient is zero. - - :param inp: Input tensor. - - See implementation of :func:`~.softmax` for example. - """ - print("zero_grad is obsoleted, please use detach instead") - raise NotImplementedError - - -def copy(inp, cn): +def copy(inp, device=None): r""" - Copy tensor to another device. + Copies tensor to another device. :param inp: input tensor. - :param cn: device that you copy to. + :param device: destination device. Examples: @@ -101,4 +94,6 @@ def copy(inp, cn): [1 2 3] """ - return apply(Copy(comp_node=cn), inp)[0] + if device is None: + return apply(Identity(), inp)[0] + return apply(Copy(comp_node=as_device(device).to_c()), inp)[0] diff --git a/imperative/python/megengine/hub/exceptions.py b/imperative/python/megengine/hub/exceptions.py index aab0a134..ede5c46c 100644 --- a/imperative/python/megengine/hub/exceptions.py +++ b/imperative/python/megengine/hub/exceptions.py @@ -19,12 +19,12 @@ class InvalidGitHost(FetcherError): class GitPullError(FetcherError): - """A git pull error occurred""" + """A git pull error occurred.""" class GitCheckoutError(FetcherError): - """A git checkout error occurred""" + """A git checkout error occurred.""" class InvalidProtocol(FetcherError): - """The protocol provided was somehow invalid""" + """The protocol provided was somehow invalid.""" diff --git a/imperative/python/megengine/hub/fetcher.py b/imperative/python/megengine/hub/fetcher.py index 4f60b3ce..583f4121 100644 --- a/imperative/python/megengine/hub/fetcher.py +++ b/imperative/python/megengine/hub/fetcher.py @@ -106,20 +106,20 @@ class GitSSHFetcher(RepoFetcherBase): :param git_host: host address of git repo. - example: github.com + Example: github.com :param repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional tag/branch. The default branch is ``master`` if not specified. - example: ``"brain_sdk/MegBrain[:hub]"`` + Example: ``"brain_sdk/MegBrain[:hub]"`` :param use_cache: - whether to use locally fetched code or completely re-fetch + whether to use locally fetched code or completely re-fetch. :param commit: - commit id on github or gitlab + commit id on github or gitlab. :param silent: whether to accept the stdout and stderr of the subprocess with PIPE, instead of - displaying on the screen + displaying on the screen. :return: - directory where the repo code is stored + directory where the repo code is stored. 
""" if not cls._check_git_host(git_host): raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host)) @@ -215,24 +215,24 @@ class GitHTTPSFetcher(RepoFetcherBase): silent: bool = True, ) -> str: """ - Fetches git repo by HTTPS protocol + Fetches git repo by HTTPS protocol. :param git_host: - host address of git repo - example: github.com + host address of git repo. + Example: github.com :param repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional tag/branch. The default branch is ``master`` if not specified. - example: ``"brain_sdk/MegBrain[:hub]"`` + Example: ``"brain_sdk/MegBrain[:hub]"`` :param use_cache: - whether to use locally cached code or completely re-fetch + whether to use locally cached code or completely re-fetch. :param commit: - commit id on github or gitlab + commit id on github or gitlab. :param silent: whether to accept the stdout and stderr of the subprocess with PIPE, instead of - displaying on the screen + displaying on the screen. :return: - directory where the repo code is stored + directory where the repo code is stored. """ if not cls._check_git_host(git_host): raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host)) diff --git a/imperative/python/megengine/hub/hub.py b/imperative/python/megengine/hub/hub.py index 139256e9..1608c281 100644 --- a/imperative/python/megengine/hub/hub.py +++ b/imperative/python/megengine/hub/hub.py @@ -94,24 +94,24 @@ def _init_hub( commit: str = None, protocol: str = DEFAULT_PROTOCOL, ): - """Imports hubmodule like python import + """Imports hubmodule like python import. :param repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional tag/branch. The default branch is ``master`` if not specified. Example: ``"brain_sdk/MegBrain[:hub]"`` :param git_host: - host address of git repo + host address of git repo. Example: github.com :param use_cache: - whether to use locally cached code or completely re-fetch + whether to use locally cached code or completely re-fetch. :param commit: - commit id on github or gitlab + commit id on github or gitlab. :param protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. The value should be one of HTTPS, SSH. :return: - hubconf.py as a python module + a python module. """ cache_dir = os.path.expanduser(os.path.join(_get_megengine_home(), "hub")) os.makedirs(cache_dir, exist_ok=True) @@ -137,24 +137,24 @@ def list( commit: str = None, protocol: str = DEFAULT_PROTOCOL, ) -> List[str]: - """Lists all entrypoints available in repo hubconf + """Lists all entrypoints available in repo hubconf. :param repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional tag/branch. The default branch is ``master`` if not specified. Example: ``"brain_sdk/MegBrain[:hub]"`` :param git_host: - host address of git repo + host address of git repo. Example: github.com :param use_cache: - whether to use locally cached code or completely re-fetch + whether to use locally cached code or completely re-fetch. :param commit: - commit id on github or gitlab + commit id on github or gitlab. :param protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. The value should be one of HTTPS, SSH. :return: - all entrypoint names of the model + all entrypoint names of the model. """ hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) @@ -182,14 +182,14 @@ def load( tag/branch. 
The default branch is ``master`` if not specified. Example: ``"brain_sdk/MegBrain[:hub]"`` :param entry: - an entrypoint defined in hubconf + an entrypoint defined in hubconf. :param git_host: - host address of git repo + host address of git repo. Example: github.com :param use_cache: - whether to use locally cached code or completely re-fetch + whether to use locally cached code or completely re-fetch. :param commit: - commit id on github or gitlab + commit id on github or gitlab. :param protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. The value should be one of HTTPS, SSH. @@ -217,9 +217,9 @@ def help( ) -> str: """This function returns docstring of entrypoint ``entry`` by following steps: - 1. Pull the repo code specified by git and repo_info + 1. Pull the repo code specified by git and repo_info. 2. Load the entry defined in repo's hubconf.py - 3. Return docstring of function entry + 3. Return docstring of function entry. :param repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional @@ -228,17 +228,17 @@ def help( :param entry: an entrypoint defined in hubconf.py :param git_host: - host address of git repo + host address of git repo. Example: github.com :param use_cache: - whether to use locally cached code or completely re-fetch + whether to use locally cached code or completely re-fetch. :param commit: - commit id on github or gitlab + commit id on github or gitlab. :param protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. The value should be one of HTTPS, SSH. :return: - docstring of entrypoint ``entry`` + docstring of entrypoint ``entry``. """ hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) @@ -255,10 +255,10 @@ def load_serialized_obj_from_url(url: str, model_dir=None) -> Any: If the object is already present in ``model_dir``, it's deserialized and returned. If no ``model_dir`` is specified, it will be ``MGE_HOME/serialized``. - :param url: url to serialized object - :param model_dir: dir to cache target serialized file + :param url: url to serialized object. + :param model_dir: dir to cache target serialized file. - :return: loaded object + :return: loaded object. """ if model_dir is None: model_dir = os.path.join(_get_megengine_home(), "serialized") diff --git a/imperative/python/megengine/hub/tools.py b/imperative/python/megengine/hub/tools.py index 0bf9c98c..a3c6c5d4 100644 --- a/imperative/python/megengine/hub/tools.py +++ b/imperative/python/megengine/hub/tools.py @@ -15,10 +15,10 @@ from typing import Iterator def load_module(name: str, path: str) -> types.ModuleType: """ - Loads module specified by name and path + Loads module specified by name and path. - :param name: module name - :param path: module path + :param name: module name. + :param path: module path. """ spec = importlib.util.spec_from_file_location(name, path) module = importlib.util.module_from_spec(spec) @@ -27,18 +27,18 @@ def load_module(name: str, path: str) -> types.ModuleType: def check_module_exists(module: str) -> bool: - """Checks whether python module exists or not + """Checks whether python module exists or not. - :param module: name of module + :param module: name of module. """ return importlib.util.find_spec(module) is not None @contextmanager def cd(target: str) -> Iterator[None]: - """Changes current directory to target + """Changes current directory to target. 
- :param target: target directory + :param target: target directory. """ prev = os.getcwd() os.chdir(os.path.expanduser(target)) diff --git a/imperative/python/megengine/jit/tracing.py b/imperative/python/megengine/jit/tracing.py index e219f607..6fe59b46 100644 --- a/imperative/python/megengine/jit/tracing.py +++ b/imperative/python/megengine/jit/tracing.py @@ -36,6 +36,13 @@ active_trace = None skip_tracing = False +def is_tracing(): + if active_trace is None: + return False + else: + return not skip_tracing + + @contextlib.contextmanager def exclude_from_trace(): global skip_tracing @@ -125,6 +132,9 @@ class trace: self._graph_opt_level = opt_level self._tensor_shape = tensor_shape + self._reset() + + def _reset(self): self._untraced = True self._tinfo = [] # handle -> TensorInfo self._seq = [] @@ -257,77 +267,117 @@ class trace: def _record_const(self, op, outputs): pass - @contextlib.contextmanager - def _setup(self): + def _set_active(self, active: bool): global active_trace - if active_trace: - raise NotImplementedError("sorry, not implemented: nested trace") - active_trace = self - - if self._untraced: - apply.enable(apply_with_tracing) - apply.enable(apply_const_with_tracing) - if self._symbolic: - apply.enable(apply_symbolic_mode) - apply.enable(apply_const_symbolic_mode) - self._lazy_eval_graph = G.Graph() + if active: + if active_trace: + raise NotImplementedError("sorry, not implemented: nested trace") + active_trace = self else: - apply.enable(apply_compiled_mode) - if self._graph is None: - self._compile() - self._graph.execute() - - yield - + assert active_trace is self + active_trace = None + + def _init_trace(self, symbolic: bool): + apply.enable(apply_with_tracing) + apply.enable(apply_const_with_tracing) + if symbolic: + apply.enable(apply_symbolic_mode) + apply.enable(apply_const_symbolic_mode) + self._lazy_eval_graph = G.Graph() + + def _take_escaped_tensors(self): escaped_tensors = tuple(self._active_tensors) self._active_tensors.clear() + return escaped_tensors - if self._untraced: - for x in escaped_tensors: - info = self._tinfo[x._TraceMixin__handle] - info.data_read = True - x._TraceMixin__restore() - if self._inputs_to_restore: - for x in self._inputs_to_restore: + def _lazy_eval(self, lazy_eval_graph, lazy_eval_tensors): + active_lazy_eval_tensors = [] + visited = set() + readers = [] + for x in lazy_eval_tensors: + x = x() + if x is None or x in visited: + continue + reader = G.OutputNode(x._LazyEvalTensor__varnode).outputs[0] + readers.append(reader) + active_lazy_eval_tensors.append(x) + visited.add(x) + self._apply_graph_options(lazy_eval_graph) + lazy_eval_graph.compile(*readers) + lazy_eval_graph() + for r, x in zip(readers, active_lazy_eval_tensors): + assign_raw_tensor(x, as_raw_tensor(r.op.get_value())) + + @contextlib.contextmanager + def _setup(self): + interrupted = False + + def do_enter(): + self._set_active(True) + if self._untraced: + self._init_trace(self._symbolic) + else: + apply.enable(apply_compiled_mode) + if self._graph is None: + self._compile() + self._graph.execute() + + def do_finalize(): + escaped_tensors = self._take_escaped_tensors() + if self._untraced: + for x in escaped_tensors: + info = self._tinfo[x._TraceMixin__handle] + info.data_read = True x._TraceMixin__restore() - if self._symbolic: - # eval lazy eval tensors - if self._lazy_eval_tensors: - lazy_eval_tensors = [] - visited = set() - readers = [] - for x in self._lazy_eval_tensors: - x = x() - if x is None or x in visited: - continue - reader = 
G.OutputNode(x._LazyEvalTensor__varnode).outputs[0] - readers.append(reader) - lazy_eval_tensors.append(x) - visited.add(x) - self._apply_graph_options(self._lazy_eval_graph) - self._lazy_eval_graph.compile(*readers) - self._lazy_eval_graph() - for r, x in zip(readers, lazy_eval_tensors): - assign_raw_tensor(x, as_raw_tensor(r.op.get_value())) + if self._inputs_to_restore: + for x in self._inputs_to_restore: + x._TraceMixin__restore() + if self._symbolic and self._lazy_eval_tensors: + # eval lazy eval tensors + self._lazy_eval(self._lazy_eval_graph, self._lazy_eval_tensors) self._lazy_eval_graph = None self._lazy_eval_tensors = None - self._untraced = False - else: - if self._pc != len(self._seq): - raise TraceMismatchError("premature end") - for x in escaped_tensors: - assign_raw_tensor(x, as_raw_tensor(x._dev_tensor())) - self._graph.wait() - self._reset_exec_env() + self._untraced = False + else: + # compiled_tensor leaks + if self._pc == len(self._seq): + for x in escaped_tensors: + try: + assign_raw_tensor(x, as_raw_tensor(x._dev_tensor())) + except TraceMismatchError: + # TraceMismatchError thrown in do_exit + pass + self._graph.wait() + self._reset_exec_env() + + # reset status self._pc = 0 - - self._tensor_remaps = None - apply.disable(apply_with_tracing) - apply.disable(apply_const_with_tracing) - apply.disable(apply_symbolic_mode) - apply.disable(apply_const_symbolic_mode) - apply.disable(apply_compiled_mode) - active_trace = None + self._tensor_remaps = None + apply.disable(apply_with_tracing) + apply.disable(apply_const_with_tracing) + apply.disable(apply_symbolic_mode) + apply.disable(apply_const_symbolic_mode) + apply.disable(apply_compiled_mode) + self._set_active(False) + + def do_exit(): + if not self._untraced and self._pc != len(self._seq): + raise TraceMismatchError("premature end") + if not self._symbolic or not self._untraced: + for x in self._active_tensors: + x._dev_tensor() + + try: + do_enter() + yield + do_exit() + except: + interrupted = True + raise + finally: + do_finalize() + if interrupted: + self._reset() def _begin_excluded_region(self): if self._capture_as_const: @@ -368,6 +418,7 @@ class trace: def _compile(self): graph = self._graph = G.Graph() graph.options.no_force_inplace = True + graph.options.async_exec_level = 0b100 self._apply_graph_options(graph) # graph.options.graph_opt_level = 0 need_reset_nodes = self._need_reset_nodes = [] @@ -570,7 +621,9 @@ class trace: if h not in h2v: assert info.external assert info.bound_data - h2v[h] = graph.make_const(info.bound_data._dev_tensor()) + h2v[h] = graph.make_const( + info.bound_data.numpy(), dtype=info.dtype, device=info.device + ) ivars.append(h2v[h]) ovars = apply(op, *ivars) assert len(ovars) == len(ohandles) diff --git a/imperative/python/megengine/logger.py b/imperative/python/megengine/logger.py index 9e926ca3..49f25e55 100644 --- a/imperative/python/megengine/logger.py +++ b/imperative/python/megengine/logger.py @@ -12,7 +12,7 @@ import os import sys _all_loggers = [] -_default_level_name = os.getenv("MEGENGINE_LOGGING_LEVEL", "ERROR") +_default_level_name = os.getenv("MEGENGINE_LOGGING_LEVEL", "INFO") _default_level = logging.getLevelName(_default_level_name.upper()) diff --git a/imperative/python/megengine/module/__init__.py b/imperative/python/megengine/module/__init__.py index 916000d0..6c5b48fd 100644 --- a/imperative/python/megengine/module/__init__.py +++ b/imperative/python/megengine/module/__init__.py @@ -8,6 +8,7 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. from .activation import LeakyReLU, PReLU, ReLU, Sigmoid, Softmax +from .adaptive_pooling import AdaptiveAvgPool2d, AdaptiveMaxPool2d from .batchnorm import BatchNorm1d, BatchNorm2d, SyncBatchNorm from .concat import Concat from .conv import Conv2d, ConvRelu2d, ConvTranspose2d, LocalConv2d diff --git a/imperative/python/megengine/module/activation.py b/imperative/python/megengine/module/activation.py index 817fc5cb..2f17fe9a 100644 --- a/imperative/python/megengine/module/activation.py +++ b/imperative/python/megengine/module/activation.py @@ -20,10 +20,10 @@ class Softmax(Module): .. math:: \text{Softmax}(x_{i}) = \frac{exp(x_i)}{\sum_j exp(x_j)} - It is applied to an n-dimensional input Tensor and rescaling them so that the elements of the - n-dimensional output Tensor lie in the range of `[0, 1]` and sum to 1. + It is applied to all elements along axis, and rescales elements so that + they stay in the range `[0, 1]` and sum to 1. - :param axis: An axis along which softmax will be applied. By default, + :param axis: Along which axis softmax will be applied. By default, softmax will apply along the highest ranked axis. Examples: @@ -55,6 +55,9 @@ class Softmax(Module): def forward(self, inputs): return softmax(inputs, self.axis) + def _module_info_string(self) -> str: + return "axis={axis}".format(axis=self.axis) + class Sigmoid(Module): r""" @@ -138,8 +141,7 @@ class PReLU(Module): \end{cases} Here :math:`a` is a learnable parameter. When called without arguments, `PReLU()` uses - a single paramter :math:`a` across all input channel. If called with `PReLU(num_of_channels)`, - a seperate :math:`a` is used for each input channle. + a single parameter :math:`a` across all input channels. If called with `PReLU(num_of_channels)`, each input channel will have its own :math:`a`. :param num_parameters: number of :math:`a` to learn, there is only two values are legitimate: 1, or the number of channels at input. Default: 1 diff --git a/imperative/python/megengine/module/adaptive_pooling.py b/imperative/python/megengine/module/adaptive_pooling.py new file mode 100644 index 00000000..99e7c57d --- /dev/null +++ b/imperative/python/megengine/module/adaptive_pooling.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from abc import abstractmethod +from typing import Tuple, Union + +from ..functional import adaptive_avg_pool2d, adaptive_max_pool2d +from ..tensor import Parameter, Tensor +from .module import Module + + +class _AdaptivePoolNd(Module): + def __init__( + self, oshp: Union[Tuple[int, int], int, Tensor], + ): + super(_AdaptivePoolNd, self).__init__() + self.oshp = oshp + + @abstractmethod + def forward(self, inp): + pass + + +class AdaptiveMaxPool2d(_AdaptivePoolNd): + r"""Applies a 2D adaptive max pooling over an input. + + For instance, given an input of the size :math:`(N, C, H, W)` and + an output shape :math:`(OH, OW)`, this layer generates the output of + the size :math:`(N, C, OH, OW)` through a process described as: + + .. 
math:: + \begin{aligned} + out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} + \text{input}(N_i, C_j, \text{stride[0]} \times h + m, + \text{stride[1]} \times w + n) + \end{aligned} + + Kernel size and stride can be inferred from the input and output shapes: + padding: (0, 0) + stride: (floor(IH / OH), floor(IW / OW)) + kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w) + + Examples: + + .. testcode:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.AdaptiveMaxPool2d((2, 2)) + inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) + oup = m(inp) + print(oup.numpy()) + + Outputs: + + .. testoutput:: + + [[[[5. 7.] + [13. 15.]]]] + + """ + + def forward(self, inp): + return adaptive_max_pool2d(inp, self.oshp) + + +class AdaptiveAvgPool2d(_AdaptivePoolNd): + r"""Applies a 2D adaptive average pooling over an input. + + For instance, given an input of the size :math:`(N, C, H, W)` and + an output shape :math:`(OH, OW)`, this layer generates the output of + the size :math:`(N, C, OH, OW)` through a process described as: + + .. math:: + + out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + + Kernel size and stride can be inferred from the input and output shapes: + padding: (0, 0) + stride: (floor(IH / OH), floor(IW / OW)) + kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w) + + Examples: + + .. testcode:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.AdaptiveAvgPool2d((2, 2)) + inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) + oup = m(inp) + print(oup.numpy()) + + Outputs: + + .. testoutput:: + + [[[[2.5 4.5] + [10.5 12.5]]]] + + """ + + def forward(self, inp): + return adaptive_avg_pool2d(inp, self.oshp) diff --git a/imperative/python/megengine/module/batchnorm.py b/imperative/python/megengine/module/batchnorm.py index 8651546b..9f2d7bd1 100644 --- a/imperative/python/megengine/module/batchnorm.py +++ b/imperative/python/megengine/module/batchnorm.py @@ -11,7 +11,7 @@ from typing import Optional import numpy as np from ..distributed.group import WORLD, Group -from ..functional import batch_norm2d, sync_batch_norm +from ..functional.nn import batch_norm, sync_batch_norm from ..tensor import Parameter, Tensor from . import init from .module import Module @@ -96,7 +96,7 @@ class _BatchNorm(Module): else: exponential_average_factor = 0.0 # useless - output = batch_norm2d( + output = batch_norm( inp, self.running_mean if self.track_running_stats else None, self.running_var if self.track_running_stats else None, @@ -113,6 +113,13 @@ class _BatchNorm(Module): return output + def _module_info_string(self) -> str: + s = ( + "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, " + "track_running_stats={track_running_stats}" + ) + return s.format(**self.__dict__) + class SyncBatchNorm(_BatchNorm): r""" @@ -213,8 +220,8 @@ class BatchNorm2d(_BatchNorm): of 0.9. If :attr:`track_running_stats` is set to ``False``, this layer will not - keep running estimates, and batch statistics are instead used during - evaluation time. + keep running estimates; batch statistics are used during + evaluation time instead. .. note:: This :attr:`momentum` argument is different from one used in optimizer @@ -229,15 +236,14 @@ Spatial Batch Normalization. 
:type num_features: int - :param num_features: usually the :math:`C` from an input of size - :math:`(N, C, H, W)` or the highest ranked dimension of an input with + :param num_features: usually :math:`C` from an input of shape + :math:`(N, C, H, W)` or the highest ranked dimension of an input with less than 4D. :type eps: float :param eps: a value added to the denominator for numerical stability. Default: 1e-5 :type momentum: float - :param momentum: the value used for the `running_mean` and `running_var` - computation. + :param momentum: the value used for the ``running_mean`` and ``running_var`` computation. Default: 0.9 :type affine: bool :param affine: a boolean value that when set to True, this module has diff --git a/imperative/python/megengine/module/conv.py b/imperative/python/megengine/module/conv.py index 8d07505c..72aae09e 100644 --- a/imperative/python/megengine/module/conv.py +++ b/imperative/python/megengine/module/conv.py @@ -70,6 +70,21 @@ class _ConvNd(Module): def _infer_bias_shape(self): pass + def _module_info_string(self): + s = "{in_channels}, {out_channels}, kernel_size={kernel_size}" + + if self.stride != (1,) * len(self.stride): + s += ", stride={stride}" + if self.padding != (0,) * len(self.padding): + s += ", padding={padding}" + if self.dilation != (1,) * len(self.dilation): + s += ", dilation={dilation}" + if self.groups != 1: + s += ", groups={groups}" + if self.bias is None: + s += ", bias=False" + return s.format(**self.__dict__) + class Conv2d(_ConvNd): r"""Applies a 2D convolution over an input tensor. @@ -84,8 +99,8 @@ class Conv2d(_ConvNd): \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k) where :math:`\star` is the valid 2D cross-correlation operator, - :math:`N` is a batch size, :math:`C` denotes a number of channels, - :math:`H` is a height of input planes in pixels, and :math:`W` is + :math:`N` is batch size, :math:`C` denotes number of channels, + :math:`H` is height of input planes in pixels, and :math:`W` is width in pixels. When `groups == in_channels` and `out_channels == K * in_channels`, @@ -105,9 +120,8 @@ class Conv2d(_ConvNd): :param padding: size of the paddings added to the input on both sides of its spatial dimensions. Only zero-padding is supported. Default: 0 :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When groups is not 1, - in_channels and out_channels must be divisible by groups, + :param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, and there would be an extra dimension at the beginning of the weight's shape. Specifically, the shape of weight would be `(groups, out_channel // groups, in_channels // groups, *kernel_size)`. @@ -115,9 +129,9 @@ class Conv2d(_ConvNd): True :param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default: `CROSS_CORRELATION` - :param compute_mode: When set to `DEFAULT`, no special requirements will be - placed on the precision of intermediate results. When set to `FLOAT32`, - float32 would be used for accumulator and intermediate result, but only + :param compute_mode: When set to "DEFAULT", no special requirements will be + placed on the precision of intermediate results. 
When set to "FLOAT32", + "Float32" would be used for accumulator and intermediate result, but only effective when input and output are of float16 dtype. Examples: @@ -221,7 +235,7 @@ class ConvTranspose2d(_ConvNd): r"""Applies a 2D transposed convolution over an input tensor. This module is also known as a deconvolution or a fractionally-strided convolution. - :class:`ConvTranspose2d` can ben seen as the gradient of :class:`Conv2d` operation + :class:`ConvTranspose2d` can be seen as the gradient of :class:`Conv2d` operation with respect to its input. Convolution usually reduces the size of input, while transposed convolution works @@ -237,8 +251,7 @@ class ConvTranspose2d(_ConvNd): :param padding: size of the paddings added to the input on both sides of its spatial dimensions. Only zero-padding is supported. Default: 0 :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When ``groups`` is not 1, + :param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1, ``in_channels`` and ``out_channels`` must be divisible by ``groups``, and there would be an extra dimension at the beginning of the weight's shape. Specifically, the shape of weight would be ``(groups, @@ -247,9 +260,9 @@ class ConvTranspose2d(_ConvNd): True :param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default: `CROSS_CORRELATION` - :param compute_mode: When set to `DEFAULT`, no special requirements will be - placed on the precision of intermediate results. When set to `FLOAT32`, - float32 would be used for accumulator and intermediate result, but only + :param compute_mode: When set to "DEFAULT", no special requirements will be + placed on the precision of intermediate results. When set to "FLOAT32", + "Float32" would be used for accumulator and intermediate result, but only effective when input and output are of float16 dtype. """ @@ -327,7 +340,7 @@ class ConvTranspose2d(_ConvNd): class LocalConv2d(Conv2d): - r"""Applies a spatial convolution with untied kernels over an input 4D tensor. + r"""Applies a spatial convolution with untied kernels over an groupped channeled input 4D tensor. It is also known as the locally connected layer. :param in_channels: number of input channels. @@ -340,9 +353,9 @@ class LocalConv2d(Conv2d): :param stride: stride of the 2D convolution operation. Default: 1 :param padding: size of the paddings added to the input on both sides of its spatial dimensions. Only zero-padding is supported. Default: 0 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When groups is not 1, - in_channels and out_channels must be divisible by groups. + :param groups: number of groups into which the input and output channels are divided, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``. The shape of weight is `(groups, output_height, output_width, in_channels // groups, *kernel_size, out_channels // groups)`. 
""" diff --git a/imperative/python/megengine/module/dropout.py b/imperative/python/megengine/module/dropout.py index 0aac9712..6ac86b3f 100644 --- a/imperative/python/megengine/module/dropout.py +++ b/imperative/python/megengine/module/dropout.py @@ -11,7 +11,7 @@ from .module import Module class Dropout(Module): - r"""Randomly set input elements to zeros with the probability :math:`drop\_prob` during training. + r"""Randomly sets input elements to zeros with the probability :math:`drop\_prob` during training. Commonly used in large networks to prevent overfitting. Note that we perform dropout only during training, we also rescale(multiply) the output tensor by :math:`\frac{1}{1 - drop\_prob}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`. @@ -28,3 +28,6 @@ class Dropout(Module): return dropout(inputs, self.drop_prob, training=True) else: return inputs + + def _module_info_string(self) -> str: + return "drop_prob={drop_prob}".format(drop_prob=self.drop_prob) diff --git a/imperative/python/megengine/module/elemwise.py b/imperative/python/megengine/module/elemwise.py index 087563f5..9bc05fbf 100644 --- a/imperative/python/megengine/module/elemwise.py +++ b/imperative/python/megengine/module/elemwise.py @@ -34,7 +34,7 @@ class Elemwise(Module): * "EXP": exp(x) * "TANH": tanh(x) * "FUSE_MUL_ADD3": x * y + z - * "FAST_TANH": fast_tanh(x) + * "FAST_TANH": x * (27. + x * x) / (27. + 9. * x * x) * "NEGATE": -x * "ACOS": acos(x) * "ASIN": asin(x) @@ -56,9 +56,9 @@ class Elemwise(Module): * "SIGMOID_GRAD": sigmoid_grad * "SWITCH_GT0": switch_gt0 * "TANH_GRAD": tanh_grad - * "LT": lt + * "LT": less * "LEQ": leq - * "EQ": eq + * "EQ": equal * "POW": pow * "LOG_SUM_EXP": log_sum_exp * "FAST_TANH_GRAD": fast_tanh_grad diff --git a/imperative/python/megengine/module/embedding.py b/imperative/python/megengine/module/embedding.py index 4a281be2..c9d8da9c 100644 --- a/imperative/python/megengine/module/embedding.py +++ b/imperative/python/megengine/module/embedding.py @@ -10,7 +10,7 @@ from typing import Optional import numpy as np -from ..functional import embedding as embedding_func +from ..functional.nn import embedding as embedding_func from ..tensor import Parameter from . import init from .module import Module @@ -26,9 +26,9 @@ class Embedding(Module): :param num_embeddings: size of embedding dictionary. :param embedding_dim: size of each embedding vector. - :param padding_idx: should be set to None, not support now. - :param max_norm: should be set to None, not support now. - :param norm_type: should be set to None, not support now. + :param padding_idx: should be set to None, not supportted now. + :param max_norm: should be set to None, not supportted now. + :param norm_type: should be set to None, not supportted now. :param initial_weight: the learnable weights of the module of shape (num_embeddings, embedding_dim). Examples: @@ -121,8 +121,8 @@ class Embedding(Module): r""" Creates Embedding instance from given 2-dimensional FloatTensor. - :param embeddings: Tensor contained weight for the embedding. - :param freeze: If ``True``, the weight does not get updated during the learning process. Default: ``True``. + :param embeddings: tensor contained weight for the embedding. + :param freeze: if ``True``, the weight does not get updated during the learning process. Default: True. :param padding_idx: should be set to None, not support Now. :param max_norm: should be set to None, not support Now. :param norm_type: should be set to None, not support Now. 
diff --git a/imperative/python/megengine/module/identity.py b/imperative/python/megengine/module/identity.py index 51b31e50..a948d256 100644 --- a/imperative/python/megengine/module/identity.py +++ b/imperative/python/megengine/module/identity.py @@ -6,7 +6,7 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from ..functional import identity +from ..functional import copy from .module import Module @@ -14,4 +14,4 @@ class Identity(Module): r"""A placeholder identity operator that will ignore any argument.""" def forward(self, x): - return identity(x) + return copy(x) diff --git a/imperative/python/megengine/module/init.py b/imperative/python/megengine/module/init.py index d155efb4..8cf43752 100644 --- a/imperative/python/megengine/module/init.py +++ b/imperative/python/megengine/module/init.py @@ -18,48 +18,48 @@ from ..tensor import Tensor def fill_(tensor: Tensor, val: Union[float, int]) -> None: - """Fill the given ``tensor`` with value ``val``. + """Fills the given ``tensor`` with value ``val``. - :param tensor: An n-dimentional tensor to be initialized - :param val: The value to be filled throughout the tensor + :param tensor: tensor to be initialized. + :param val: value to be filled throughout the tensor. """ tensor._reset(full(shape=tensor.shape, value=val, dtype=tensor.dtype)) def zeros_(tensor: Tensor) -> None: - """Fill the given ``tensor`` with scalar value `0`. + """Fills the given ``tensor`` with scalar value `0`. - :param tensor: An n-dimentional tensor to be initialized + :param tensor: tensor to be initialized. """ fill_(tensor, 0) def ones_(tensor: Tensor) -> None: - """Fill the given ``tensor`` with the scalar value `1`. + """Fills the given ``tensor`` with the scalar value `1`. - :param tensor: An n-dimentional tensor to be initialized + :param tensor: tensor to be initialized. """ fill_(tensor, 1) def uniform_(tensor: Tensor, a: float = 0.0, b: float = 1.0) -> None: - r"""Fill the given ``tensor`` with random value sampled from uniform distribution + r"""Fills the given ``tensor`` with random value sampled from uniform distribution :math:`\mathcal{U}(\text{a}, \text{b})`. - :param tensor: An n-dimentional tensor to be initialized - :param a: Lower bound of the sampling interval - :param b: Upper bound of the sampling interval + :param tensor: tensor to be initialized. + :param a: lower bound of the sampling interval. + :param b: upper bound of the sampling interval. """ tensor._reset(uniform(size=tensor.shape, low=a, high=b).astype(tensor.dtype)) def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: - r"""Fill the given ``tensor`` with random value sampled from normal distribution + r"""Fills the given ``tensor`` with random value sampled from normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. - :param tensor: An n-dimentional tensor to be initialized - :param mean: The mean of the normal distribution - :param std: The standard deviation of the normal distribution + :param tensor: tensor to be initialized. + :param mean: mean of the normal distribution. + :param std: standard deviation of the normal distribution. 
""" tensor._reset(normal(size=tensor.shape, mean=mean, std=std).astype(tensor.dtype)) @@ -67,7 +67,7 @@ def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: def calculate_gain( nonlinearity: str, param: Optional[Union[int, float]] = None ) -> float: - r"""Return a recommended gain value (see the table below) for the given nonlinearity + r"""Returns a recommended gain value (see the table below) for the given nonlinearity function. ================= ==================================================== @@ -81,8 +81,8 @@ def calculate_gain( Leaky Relu :math:`\sqrt{\frac{2}{1 + {\text{negative}_\text{slope}}^2}}` ================= ==================================================== - :param nonlinearity: Name of the non-linear function - :param param: Optional parameter for leaky_relu. Only effective when + :param nonlinearity: name of the non-linear function. + :param param: optional parameter for leaky_relu. Only effective when ``nonlinearity`` is "leaky_relu". """ @@ -119,10 +119,10 @@ def calculate_gain( def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]: """ - Calculate fan_in / fan_out value for given weight tensor. This function assumes - input tensor is stored in NCHW format. + Calculates fan_in / fan_out value for given weight tensor. This function assumes + input tensor is stored in ``NCHW`` format. - :param tensor: Weight tensor in NCHW format + :param tensor: weight tensor in ``NCHW`` format. """ shape = tensor.shape ndim = len(shape) @@ -148,13 +148,13 @@ def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]: def calculate_correct_fan(tensor: Tensor, mode: str) -> float: """ - Calculate fan_in or fan_out value for given weight tensor, depending on given + Calculates fan_in / fan_out value for given weight tensor, depending on given ``mode``. See :func:`calculate_fan_in_and_fan_out` for details. - :param tensor: Weight tensor in NCHW format - :param mode: ``'fan_in'`` or ``'fan_out'`` + :param tensor: weight tensor in ``NCHW`` format. + :param mode: "fan_in" or "fan_out". """ mode = mode.lower() valid_modes = ["fan_in", "fan_out"] @@ -168,7 +168,7 @@ def calculate_correct_fan(tensor: Tensor, mode: str) -> float: def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None: - r"""Fill ``tensor`` with random values sampled from :math:`\mathcal{U}(-a, a)` + r"""Fills tensor with random values sampled from :math:`\mathcal{U}(-a, a)` where .. math:: @@ -178,8 +178,8 @@ def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None: `Understanding the difficulty of training deep feedforward neural networks` - Glorot, X. & Bengio, Y. (2010). - :param tensor: An n-dimentional tensor to be initialized - :param gain: Scaling factor for :math:`a`. + :param tensor: tensor to be initialized. + :param gain: scaling factor for :math:`a`. """ fan_in, fan_out = calculate_fan_in_and_fan_out(tensor) std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) @@ -188,7 +188,7 @@ def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None: def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None: - r"""Fill ``tensor`` with random values sampled from + r"""Fills tensor with random values sampled from :math:`\mathcal{N}(0, \text{std}^2)` where .. math:: @@ -198,8 +198,8 @@ def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None: `Understanding the difficulty of training deep feedforward neural networks` - Glorot, X. & Bengio, Y. (2010). 
-    :param tensor: An n-dimentional tensor to be initialized
-    :param gain: Scaling factor for :math:`std`.
+    :param tensor: tensor to be initialized.
+    :param gain: scaling factor for :math:`std`.
     """
     fan_in, fan_out = calculate_fan_in_and_fan_out(tensor)
     std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
@@ -209,7 +209,7 @@ def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None:
 def msra_uniform_(
     tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
 ) -> None:
-    r"""Fill ``tensor`` wilth random values sampled from
+    r"""Fills tensor with random values sampled from
     :math:`\mathcal{U}(-\text{bound}, \text{bound})` where
 
     .. math::
@@ -219,13 +219,13 @@
     `Delving deep into rectifiers: Surpassing human-level performance on ImageNet
     classification`
 
-    :param tensor: An n-dimentional tensor to be initialized
-    :param a: Optional parameter for calculating gain for leaky_relu. See
+    :param tensor: tensor to be initialized.
+    :param a: optional parameter for calculating gain for leaky_relu. See
         :func:`calculate_gain` for details.
-    :param mode: ``'fan_in'`` or ``'fan_out'``, used to calculate :math:`gain`, the
+    :param mode: "fan_in" or "fan_out", used to calculate :math:`gain`, the
         scaling factor for :math:`bound`. See :func:`calculate_fan_in_and_fan_out` for
         details.
-    :param nonlinearity: Name of the non-linear function used to calculate :math:`gain`.
+    :param nonlinearity: name of the non-linear function used to calculate :math:`gain`.
         See :func:`calculate_gain` for details.
     """
     fan = calculate_correct_fan(tensor, mode)
@@ -238,7 +238,7 @@ def msra_uniform_(
 def msra_normal_(
     tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
 ) -> None:
-    r"""Fill ``tensor`` wilth random values sampled from
+    r"""Fills tensor with random values sampled from
     :math:`\mathcal{N}(0, \text{std}^2)` where
 
     .. math::
@@ -248,13 +248,13 @@
     `Delving deep into rectifiers: Surpassing human-level performance on ImageNet
     classification`
 
-    :param tensor: An n-dimentional tensor to be initialized
-    :param a: Optional parameter for calculating gain for leaky_relu. See
+    :param tensor: tensor to be initialized.
+    :param a: optional parameter for calculating gain for leaky_relu. See
        :func:`calculate_gain` for details.
-    :param mode: ``'fan_in'`` or ``'fan_out'``, used to calculate :math:`gain`, the
+    :param mode: "fan_in" or "fan_out", used to calculate :math:`gain`, the
        scaling factor for :math:`gain`. See :func:`calculate_fan_in_and_fan_out` for
        details.
-    :param nonlinearity: Name of the non-linear function used to calculate :math:`gain`.
+    :param nonlinearity: name of the non-linear function used to calculate :math:`gain`.
        See :func:`calculate_gain` for details.
     """
     fan = calculate_correct_fan(tensor, mode)
diff --git a/imperative/python/megengine/module/linear.py b/imperative/python/megengine/module/linear.py
index ba5c81aa..8e7ebed7 100644
--- a/imperative/python/megengine/module/linear.py
+++ b/imperative/python/megengine/module/linear.py
@@ -7,7 +7,7 @@
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import numpy as np
 
-from ..functional import linear
+from ..functional.nn import linear
 from ..tensor import Parameter
 from . import init
 from .module import Module
@@ -25,7 +25,7 @@ class Linear(Module):
 
     :param in_features: size of each input sample.
     :param out_features: size of each output sample.
-    :param bias: If set to ``False``, the layer will not learn an additive bias.
+    :param bias: if it's ``False``, the layer will not learn an additive bias.
         Default: ``True``
 
     Examples:
 
@@ -78,3 +78,8 @@ class Linear(Module):
 
     def forward(self, x):
         return self._calc_linear(x, self.weight, self.bias)
+
+    def _module_info_string(self) -> str:
+        return "in_features={}, out_features={}, bias={}".format(
+            self.in_features, self.out_features, self.bias is not None
+        )
diff --git a/imperative/python/megengine/module/module.py b/imperative/python/megengine/module/module.py
index bf87be9d..856c0f01 100644
--- a/imperative/python/megengine/module/module.py
+++ b/imperative/python/megengine/module/module.py
@@ -69,14 +69,14 @@ class Module(metaclass=ABCMeta):
         self._forward_pre_hooks = OrderedDict()
         self._forward_hooks = OrderedDict()
 
+        self._modules = []
+
     @abstractmethod
     def forward(self, inputs):
         pass
 
     def register_forward_pre_hook(self, hook: Callable) -> HookHandler:
-        """Register a hook to handle forward inputs. `hook` should be a function
-
-        Note that `inputs` keyword inputs
+        """Registers a hook to handle forward inputs. `hook` should be a function.
 
         :param hook: a function that receives `module` and `inputs`, then returns
             a modified `inputs` or `None`.
@@ -85,7 +85,7 @@
         return HookHandler(self._forward_pre_hooks, hook)
 
     def register_forward_hook(self, hook: Callable) -> HookHandler:
-        """Register a hook to handle forward results. `hook` should be a function that
+        """Registers a hook to handle forward results. `hook` should be a function that
         receives `module`, `inputs` and `outputs`, then returns a modified `outputs` or
         `None`. This method returns a handler with :meth:`~.HookHandler.remove`
         interface to delete the hook.
@@ -124,12 +124,12 @@
         returned iterable is guaranteed to be identical, as long as all the involved
         module objects' ``__dict__`` does not change throughout those calls.
 
-        :param recursive: Whether to recursively scan all the submodules.
-        :param with_key: Whether to yield keys along with yielded objects.
-        :param with_parent: Whether to yield ``self`` along with yielded objects.
-        :param prefix: The prefix appended to the yielded keys.
-        :param predicate: The predicate function applied to scanned objects.
-        :param seen: A dict that records whether a module has been traversed yet.
+        :param recursive: whether to recursively scan all the submodules.
+        :param with_key: whether to yield keys along with yielded objects.
+        :param with_parent: whether to yield ``self`` along with yielded objects.
+        :param prefix: prefix appended to the yielded keys.
+        :param predicate: the predicate function applied to scanned objects.
+        :param seen: a dict that records whether a module has been traversed yet.
         """
         if seen is None:
             seen = set([id(self)])
@@ -191,10 +191,10 @@
         self, prefix: Optional[str] = None, recursive: bool = True, **kwargs
     ) -> Iterable[Tuple[str, Parameter]]:
         """Returns an iterable for key :class:`~.Parameter` pairs of the module, where
-        ``key`` is the dotted path from this module to the :class:`~.Parameter` .
+        ``key`` is the dotted path from this module to the :class:`~.Parameter`.
 
-        :param prefix: The prefix prepended to the keys.
-        :param recursive: If ``True``, returns all :class:`~.Parameter` within this
+        :param prefix: prefix prepended to the keys.
+        :param recursive: if ``True``, returns all :class:`~.Parameter` within this
             module, else only returns :class:`~.Parameter` that are direct
             attributes of this module.
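+
+        Example (``net`` is a hypothetical module instance)::
+
+            for name, param in net.named_parameters():
+                print(name, param.shape)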
""" @@ -223,7 +223,7 @@ class Module(metaclass=ABCMeta): Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`. - :param recursive: If ``True``, returns all buffers within this + :param recursive: if ``True``, returns all buffers within this module, else only returns buffers that are direct attributes of this module. """ @@ -239,8 +239,8 @@ class Module(metaclass=ABCMeta): Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`. - :param prefix: The prefix prepended to the keys. - :param recursive: If ``True``, returns all buffers within this + :param prefix: prefix prepended to the keys. + :param recursive: if ``True``, returns all buffers within this module, else only returns buffers that are direct attributes of this module. """ @@ -285,7 +285,7 @@ class Module(metaclass=ABCMeta): module, including itself, where 'key' is the dotted path from this module to the submodules. - :param prefix: The prefix prepended to the path. + :param prefix: prefix prepended to the path. """ if "with_parent" in kwargs and kwargs["with_parent"]: yield ("" if prefix is None else prefix), self, None @@ -296,24 +296,24 @@ class Module(metaclass=ABCMeta): ) def apply(self, fn: "Callable[[Module], Any]") -> None: - """Apply function ``fn`` to all the modules within this module, including + """Applies function ``fn`` to all the modules within this module, including itself. - :param fn: The function to be applied on modules. + :param fn: the function to be applied on modules. """ for it in self.modules(): fn(it) @deprecated(version="1.0") def zero_grad(self) -> None: - """Set all parameters' grads to zero + """Sets all parameters' grads to zero """ for param in self.parameters(): if param.grad is not None: param.grad.reset_zero() def train(self, mode: bool = True, recursive: bool = True) -> None: - """Set training mode of all the modules within this module (including itself) to + """Sets training mode of all the modules within this module (including itself) to ``mode``. This effectively sets the ``training`` attributes of those modules to ``mode``, but only has effect on certain modules (e.g. :class:`~.BatchNorm2d`, :class:`~.Dropout`, :class:`~.Observer`) @@ -331,14 +331,14 @@ class Module(metaclass=ABCMeta): self.apply(fn) def eval(self) -> None: - """Set training mode of all the modules within this module (including itself) to + """Sets training mode of all the modules within this module (including itself) to ``False``. See :meth:`~.Module.train` for details. """ self.train(False) def disable_quantize(self, value=True): r""" - Set ``module``'s ``quantize_disabled`` attribute and return ``module``. + Sets ``module``'s ``quantize_disabled`` attribute and return ``module``. Could be used as a decorator. """ @@ -351,7 +351,7 @@ class Module(metaclass=ABCMeta): def replace_param( self, params: dict, start_pos: int, seen: Optional[Set[int]] = None ): - """Replace module's parameters with `params`, used by :class:`~.ParamPack` to + """Replaces module's parameters with `params`, used by :class:`~.ParamPack` to speedup multimachine training. """ offset = 0 @@ -407,7 +407,7 @@ class Module(metaclass=ABCMeta): state_dict: Union[dict, Callable[[str, Tensor], Optional[np.ndarray]]], strict=True, ): - r"""Load a given dictionary created by :func:`state_dict` into this module. + r"""Loads a given dictionary created by :func:`state_dict` into this module. If ``strict`` is ``True``, the keys of :func:`state_dict` must exactly match the keys returned by :func:`state_dict`. 
@@ -518,3 +518,57 @@ class Module(metaclass=ABCMeta): loaded.append(k) return set(loaded), set(skipped) + + def __setattr__(self, name: str, value): + if _is_module(value): + modules = self.__dict__.get("_modules") + if modules is None: + raise AttributeError( + "cannot assign module before Module.__init__() call" + ) + if name not in self.__dict__: + modules.append(name) + super().__setattr__(name, value) + + def __delattr__(self, name: str): + if name in self.__dict__ and _is_module(self.__dict__[name]): + modules = self.__dict__.get("_modules") + modules.remove(name) + super().__delattr__(name) + + def _module_info_string(self) -> str: + r"""Set the extra representation of the module. + """ + return "" + + def __repr__(self): + def add_indent(repr_str, num_spaces): + s = repr_str.split("\n") + # don't do anything for single-line stuff + if len(s) == 1: + return repr_str + first = s.pop(0) + s = [(num_spaces * " ") + line for line in s] + s = "\n".join(s) + s = first + "\n" + s + return s + + extra_lines = [] + extra_repr = self._module_info_string() + if extra_repr: + extra_lines = extra_repr.split("\n") + child_lines = [ + "(" + name + "): " + add_indent(repr(self.__dict__[name]), 2) + for name in self._modules + ] + lines = extra_lines + child_lines + main_str = self.__class__.__name__ + "(" + if lines: + # simple one-liner info, which most builtin Modules will use + if len(extra_lines) == 1 and not child_lines: + main_str += extra_lines[0] + else: + main_str += "\n " + "\n ".join(lines) + "\n" + + main_str += ")" + return main_str diff --git a/imperative/python/megengine/module/pooling.py b/imperative/python/megengine/module/pooling.py index b5c10a09..10dfc140 100644 --- a/imperative/python/megengine/module/pooling.py +++ b/imperative/python/megengine/module/pooling.py @@ -29,6 +29,11 @@ class _PoolNd(Module): def forward(self, inp): pass + def _module_info_string(self) -> str: + return "kernel_size={kernel_size}, stride={stride}, padding={padding}".format( + **self.__dict__ + ) + class MaxPool2d(_PoolNd): r"""Applies a 2D max pooling over an input. diff --git a/imperative/python/megengine/module/qat/conv_bn.py b/imperative/python/megengine/module/qat/conv_bn.py index baa0d769..bb7414d9 100644 --- a/imperative/python/megengine/module/qat/conv_bn.py +++ b/imperative/python/megengine/module/qat/conv_bn.py @@ -5,7 +5,7 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from ...functional import add_update, ones, relu, sqrt, sum, zeros +from ...functional import ones, relu, sqrt, sum, zeros from ...quantization.utils import fake_quant_bias from .. 
import conv_bn as Float from .module import QATModule @@ -76,18 +76,10 @@ class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule): bn_var.detach() * num_elements_per_channel / (num_elements_per_channel - 1) ) exponential_average_factor = 1 - self.bn.momentum - add_update( - self.bn.running_mean, - delta=bn_mean, - alpha=1 - exponential_average_factor, - beta=exponential_average_factor, - ) - add_update( - self.bn.running_var, - delta=bn_var, - alpha=1 - exponential_average_factor, - beta=exponential_average_factor, - ) + self.bn.running_mean *= self.bn.momentum + self.bn.running_mean += exponential_average_factor * bn_mean + self.bn.running_var *= self.bn.momentum + self.bn.running_var += exponential_average_factor * bn_var def calc_conv_bn_qat(self, inp, approx=True): if self.training and not approx: diff --git a/imperative/python/megengine/module/qat/linear.py b/imperative/python/megengine/module/qat/linear.py index 4067d51c..6c57beca 100644 --- a/imperative/python/megengine/module/qat/linear.py +++ b/imperative/python/megengine/module/qat/linear.py @@ -18,7 +18,7 @@ class Linear(Float.Linear, QATModule): :param in_features: size of each input sample. :param out_features: size of each output sample. :param bias: If set to ``False``, the layer will not learn an additive bias. - Default: ``True`` + Default: True """ diff --git a/imperative/python/megengine/module/qat/module.py b/imperative/python/megengine/module/qat/module.py index 544e04af..04da6000 100644 --- a/imperative/python/megengine/module/qat/module.py +++ b/imperative/python/megengine/module/qat/module.py @@ -52,7 +52,7 @@ class QATModule(Module): self.weight_fake_quant = safe_call(qconfig.weight_fake_quant) def _enable_exec(self, with_module, func, enable): - if not with_module: + if not with_module or not func: return if enable: func.enable() diff --git a/imperative/python/megengine/module/quantized/concat.py b/imperative/python/megengine/module/quantized/concat.py index 5815d7d9..801627ca 100644 --- a/imperative/python/megengine/module/quantized/concat.py +++ b/imperative/python/megengine/module/quantized/concat.py @@ -15,7 +15,7 @@ from .module import QuantizedModule class Concat(QuantizedModule): r""" - A :class:`~.QuantizedModule` to do quantized concat, inference only. + A :class:`~.QuantizedModule` to do quantized concat, used for inference only. """ def __init__(self, dtype=None): @@ -29,7 +29,7 @@ class Concat(QuantizedModule): @classmethod def from_qat_module(cls, qat_module: QAT.Concat): r""" - return a :class:`~.QuantizedModule` instance converted from a + Return a :class:`~.QuantizedModule` instance converted from a :class:`~.QATModule` instance. """ return cls(qat_module.get_activation_dtype()) diff --git a/imperative/python/megengine/module/quantized/conv.py b/imperative/python/megengine/module/quantized/conv.py index 0710a2b0..d8935cd7 100644 --- a/imperative/python/megengine/module/quantized/conv.py +++ b/imperative/python/megengine/module/quantized/conv.py @@ -11,17 +11,17 @@ import numpy as np from ... import module as Float from ...core.tensor import dtype -from ...functional import conv_bias_activation +from ...functional.nn import conv_bias_activation from ...tensor import Parameter from ..qat import conv as QAT from .module import QuantizedModule class Conv2d(Float.Conv2d, QuantizedModule): - r"""quantized version of :class:`~.qat.conv.Conv2d`.""" - r"""Applies a 2D convolution over an quantized input tensor, inference only. 
+ r"""Quantized version of :class:`~.qat.conv.Conv2d`.""" + r"""Applies a 2D convolution over a quantized input tensor, used for inference only. - The parameter is same with :class: `~.Conv2d` + The parameter is same with :class: `~.Conv2d`. """ def __init__( @@ -101,7 +101,7 @@ class Conv2d(Float.Conv2d, QuantizedModule): class ConvRelu2d(Conv2d): - r"""quantized version of :class:`~.qat.conv.ConvRelu2d`.""" + r"""Quantized version of :class:`~.qat.conv.ConvRelu2d`.""" def forward(self, inp): return self.calc_conv_quantized(inp, nonlinear_mode="RELU") diff --git a/imperative/python/megengine/module/quantized/conv_bn.py b/imperative/python/megengine/module/quantized/conv_bn.py index 529c7932..01ce612d 100644 --- a/imperative/python/megengine/module/quantized/conv_bn.py +++ b/imperative/python/megengine/module/quantized/conv_bn.py @@ -11,15 +11,15 @@ from .conv import Conv2d class _ConvBnActivation2d(Conv2d): - r"""Applies a 2D convolution over an quantized input tensor, inference only. + r"""Applies a 2D convolution over a quantized input tensor, used for inference only. - The parameter is same with :class: `~.Conv2d` + The parameter is same with :class: `~.Conv2d`. """ @classmethod def from_qat_module(cls, qat_module: QAT._ConvBnActivation2d): r""" - return a :class:`~.QuantizedModule` instance converted from a + Return a :class:`~.QuantizedModule` instance converted from a :class:`~.QATModule` instance. """ output_dtype = qat_module.get_activation_dtype() @@ -43,14 +43,14 @@ class _ConvBnActivation2d(Conv2d): class ConvBn2d(_ConvBnActivation2d): - r"""quantized version of :class:`~.qat.conv_bn.ConvBn2d`.""" + r"""Quantized version of :class:`~.qat.conv_bn.ConvBn2d`.""" def forward(self, inp): return self.calc_conv_quantized(inp, nonlinear_mode="IDENTITY") class ConvBnRelu2d(_ConvBnActivation2d): - r"""quantized version of :class:`~.qat.conv_bn.ConvBnRelu2d`.""" + r"""Quantized version of :class:`~.qat.conv_bn.ConvBnRelu2d`.""" def forward(self, inp): return self.calc_conv_quantized(inp, nonlinear_mode="RELU") diff --git a/imperative/python/megengine/module/quantized/elemwise.py b/imperative/python/megengine/module/quantized/elemwise.py index 8caee62e..5021be1a 100644 --- a/imperative/python/megengine/module/quantized/elemwise.py +++ b/imperative/python/megengine/module/quantized/elemwise.py @@ -13,7 +13,7 @@ from .module import QuantizedModule class Elemwise(QuantizedModule): - r"""quantized version of :class:`~.qat.elemwise.Elemwise`.""" + r"""Quantized version of :class:`~.qat.elemwise.Elemwise`.""" _elemwise_multi_type_mode = P.ElemwiseMultiType.Mode @@ -30,7 +30,7 @@ class Elemwise(QuantizedModule): @classmethod def from_qat_module(cls, qat_module: QAT.Elemwise): r""" - return a :class:`~.QuantizedModule` instance converted from a + Return a :class:`~.QuantizedModule` instance converted from a :class:`~.QATModule` instance. 
""" return cls(qat_module.method.name, qat_module.get_activation_dtype()) diff --git a/imperative/python/megengine/module/quantized/linear.py b/imperative/python/megengine/module/quantized/linear.py index 2f26d430..c01b2b49 100644 --- a/imperative/python/megengine/module/quantized/linear.py +++ b/imperative/python/megengine/module/quantized/linear.py @@ -15,7 +15,7 @@ from .module import QuantizedModule class Linear(QuantizedModule): - r"""quantized version of :class:`~.qat.linear.Linear`.""" + r"""Quantized version of :class:`~.qat.linear.Linear`.""" def __init__( self, dtype: np.dtype = None, @@ -31,7 +31,7 @@ class Linear(QuantizedModule): inp_scale = dtype.get_scale(inp.dtype) w_scale = dtype.get_scale(self.weight.dtype) bias_dtype = dtype.qint32(inp_scale * w_scale) - return F.linear( + return F.nn.linear( inp, self.weight, None if self.bias is None else self.bias.astype(bias_dtype), @@ -40,7 +40,7 @@ class Linear(QuantizedModule): @classmethod def from_qat_module(cls, qat_module: QAT.Linear): r""" - return a :class:`~.QuantizedModule` instance converted from a + Return a :class:`~.QuantizedModule` instance converted from a :class:`~.QATModule` instance. """ output_dtype = qat_module.get_activation_dtype() diff --git a/imperative/python/megengine/module/quantized/module.py b/imperative/python/megengine/module/quantized/module.py index 4fccdbfa..47e6da96 100644 --- a/imperative/python/megengine/module/quantized/module.py +++ b/imperative/python/megengine/module/quantized/module.py @@ -26,6 +26,6 @@ class QuantizedModule(Module): @abstractmethod def from_qat_module(cls, qat_module: QATModule): r""" - return a :class:`~.QuantizedModule` instance converted from a + Return a :class:`~.QuantizedModule` instance converted from a :class:`~.QATModule` instance. """ diff --git a/imperative/python/megengine/module/quantized/quant_dequant.py b/imperative/python/megengine/module/quantized/quant_dequant.py index 0c245011..e4541674 100644 --- a/imperative/python/megengine/module/quantized/quant_dequant.py +++ b/imperative/python/megengine/module/quantized/quant_dequant.py @@ -11,7 +11,7 @@ from .module import QuantizedModule class QuantStub(QuantizedModule): r""" - quantized version of :class:`~.qat.quant_dequant.QuantStub`, + Quantized version of :class:`~.qat.quant_dequant.QuantStub`, will convert input to quantized dtype. """ @@ -25,7 +25,7 @@ class QuantStub(QuantizedModule): @classmethod def from_qat_module(cls, qat_module: QAT.QuantStub): r""" - return a :class:`~.QuantizedModule` instance converted from a + Return a :class:`~.QuantizedModule` instance converted from a :class:`~.QATModule` instance. """ return cls(qat_module.get_activation_dtype()) @@ -33,7 +33,7 @@ class QuantStub(QuantizedModule): class DequantStub(QuantizedModule): r""" - quantized version of :class:`~.qat.quant_dequant.DequantStub`, + Quantized version of :class:`~.qat.quant_dequant.DequantStub`, will restore quantized input to float32 dtype. """ @@ -43,7 +43,7 @@ class DequantStub(QuantizedModule): @classmethod def from_qat_module(cls, qat_module: QAT.DequantStub): r""" - return a :class:`~.QuantizedModule` instance converted from a + Return a :class:`~.QuantizedModule` instance converted from a :class:`~.QATModule` instance. 
""" return cls() diff --git a/imperative/python/megengine/module/sequential.py b/imperative/python/megengine/module/sequential.py index 210dd196..ce021eff 100644 --- a/imperative/python/megengine/module/sequential.py +++ b/imperative/python/megengine/module/sequential.py @@ -26,40 +26,40 @@ class Sequential(Module): import megengine as mge import megengine.module as M import megengine.functional as F + from collections import OrderedDict batch_size = 64 data = mge.tensor(np.zeros((batch_size, 1, 28, 28)), dtype=np.float32) label = mge.tensor(np.zeros(batch_size,), dtype=np.int32) data = data.reshape(batch_size, -1) - net = M.Sequential( + net0 = M.Sequential( M.Linear(28 * 28, 320), - M.Linear(320, 500), - M.Linear(500, 320), M.Linear(320, 10) ) - pred = net(data) + pred0 = net0(data) - loss = F.cross_entropy_with_softmax(pred, label) + modules = OrderedDict() + modules["fc0"] = nn.Linear(28 * 28, 320) + modules["fc1"] = nn.Linear(320, 10) + net1 = nn.Sequential(modules) + pred1 = net1(data) """ def __init__(self, *args): super().__init__() self.layer_keys = [] - self.layer_values = [] if len(args) == 1 and isinstance(args[0], OrderedDict): for key, module in args[0].items(): # self.add_module(key, module) setattr(self, key, module) self.layer_keys.append(key) - self.layer_values.append(module) else: for idx, module in enumerate(args): # self.add_module(str(idx), module) setattr(self, str(idx), module) self.layer_keys.append(str(idx)) - self.layer_values.append(module) def __getitem__(self, idx): if isinstance(idx, slice): @@ -67,11 +67,10 @@ class Sequential(Module): OrderedDict(zip(self.layer_keys[idx], self.layer_values[idx])) ) else: - return self.layer_values[idx] + return getattr(self, self.layer_keys[idx]) def __setitem__(self, idx, module): key = self.layer_keys[idx] - self.layer_values[idx] = module return setattr(self, key, module) def __delitem__(self, idx): @@ -79,11 +78,9 @@ class Sequential(Module): for key in self.layer_keys[idx]: delattr(self, key) del self.layer_keys[idx] - del self.layer_values[idx] else: delattr(self, self.layer_keys[idx]) del self.layer_keys[idx] - del self.layer_values[idx] def __len__(self): return len(self.layer_keys) @@ -91,6 +88,10 @@ class Sequential(Module): def __iter__(self): return iter(self.layer_values) + @property + def layer_values(self): + return [getattr(self, key) for key in self.layer_keys] + def forward(self, inp): for layer in self.layer_values: inp = layer(inp) diff --git a/imperative/python/megengine/optimizer/adadelta.py b/imperative/python/megengine/optimizer/adadelta.py index 1a9558ea..15ee9e53 100644 --- a/imperative/python/megengine/optimizer/adadelta.py +++ b/imperative/python/megengine/optimizer/adadelta.py @@ -22,13 +22,13 @@ class Adadelta(Optimizer): :param params: iterable of parameters to optimize or dicts defining parameter groups. - :param lr: coefficient that scale delta before it is applied - to the parameters (default: 1.0). + :param lr: coefficient that scales delta before it is applied + to the parameters. Default: 1.0 :param rho: coefficient used for computing a running average - of squared gradients (default: 0.9). + of squared gradients. Default: 0.9 :param eps: term added to the denominator to improve - numerical stability (default: 1e-6). - :param weight_decay: weight decay (L2 penalty) (default: 0). + numerical stability. Default: 1e-6 + :param weight_decay: weight decay (L2 penalty). 
Default: 0 """ def __init__( diff --git a/imperative/python/megengine/optimizer/adagrad.py b/imperative/python/megengine/optimizer/adagrad.py index 7a229747..0d822d5c 100644 --- a/imperative/python/megengine/optimizer/adagrad.py +++ b/imperative/python/megengine/optimizer/adagrad.py @@ -23,12 +23,12 @@ class Adagrad(Optimizer): :param params: iterable of parameters to optimize or dicts defining parameter groups. - :param lr: coefficient that scale delta before it is applied - to the parameters (default: 1e-2). - :param lr_decay: learning rate decay (default: 0) + :param lr: coefficient that scales delta before it is applied + to the parameters. Default: 1e-2 + :param lr_decay: learning rate decay. Default: 0 :param eps: term added to the denominator to improve - numerical stability (default: 1e-10). - :param weight_decay: weight decay (L2 penalty) (default: 0). + numerical stability. Default: 1e-10 + :param weight_decay: weight decay (L2 penalty). Default: 0 """ def __init__( diff --git a/imperative/python/megengine/optimizer/lr_scheduler.py b/imperative/python/megengine/optimizer/lr_scheduler.py index d2b6c859..f873772e 100644 --- a/imperative/python/megengine/optimizer/lr_scheduler.py +++ b/imperative/python/megengine/optimizer/lr_scheduler.py @@ -14,8 +14,8 @@ from .optimizer import Optimizer class LRScheduler(metaclass=ABCMeta): r"""Base class for all learning rate based schedulers. - :param optimizer: Wrapped optimizer. - :param current_epoch: The index of current epoch. Default: -1 + :param optimizer: wrapped optimizer. + :param current_epoch: the index of current epoch. Default: -1 """ def __init__( # pylint: disable=too-many-branches @@ -53,7 +53,8 @@ class LRScheduler(metaclass=ABCMeta): def load_state_dict(self, state_dict): r"""Loads the schedulers state. - :param state_dict (dict): scheduler state. + :type state_dict: dict + :param state_dict: scheduler state. """ raise NotImplementedError diff --git a/imperative/python/megengine/optimizer/multi_step_lr.py b/imperative/python/megengine/optimizer/multi_step_lr.py index 602f9228..b5cba7e2 100644 --- a/imperative/python/megengine/optimizer/multi_step_lr.py +++ b/imperative/python/megengine/optimizer/multi_step_lr.py @@ -17,10 +17,12 @@ class MultiStepLR(LRScheduler): r"""Decays the learning rate of each parameter group by gamma once the number of epoch reaches one of the milestones. - :param optimizer: Wrapped optimizer. - :param milestones (list): List of epoch indices. Must be increasing. - :param gamma (float): Multiplicative factor of learning rate decay. Default: 0.1. - :param current_epoch: The index of current epoch. Default: -1. + :param optimizer: wrapped optimizer. + :type milestones: list + :param milestones: list of epoch indices which should be increasing. + :type gamma: float + :param gamma: multiplicative factor of learning rate decay. Default: 0.1 + :param current_epoch: the index of current epoch. Default: -1 """ def __init__( @@ -55,7 +57,8 @@ class MultiStepLR(LRScheduler): def load_state_dict(self, state_dict): r"""Loads the schedulers state. - :param state_dict (dict): scheduler state. + :type state_dict: dict + :param state_dict: scheduler state. 
""" tmp_dict = {} for key in ["milestones", "gamma", "current_epoch"]: diff --git a/imperative/python/megengine/quantization/fake_quant.py b/imperative/python/megengine/quantization/fake_quant.py index 06f0cff8..774a7cae 100644 --- a/imperative/python/megengine/quantization/fake_quant.py +++ b/imperative/python/megengine/quantization/fake_quant.py @@ -22,10 +22,10 @@ class _FakeQuantize(Module): r""" A Basic Fake Quant module. - :param dtype: A string indicating the target quantization type of input. - :param narrow_range: Whether the absolute value of ``qmin`` is the same as ``qmax``, + :param dtype: a string indicating the target quantization type of input. + :param narrow_range: whether the absolute value of ``qmin`` is the same as ``qmax``, instead of 1 greater. Usually True for weight and False for activation. - :param enable: Whether do ``normal_forward`` or ``fake_quant_forward``. + :param enable: whether do ``normal_forward`` or ``fake_quant_forward``. """ def __init__(self, dtype: str, narrow_range: bool = False, enable: bool = True): @@ -127,7 +127,7 @@ class TQT(_FakeQuantize): # when disable, TQT will do normal forward, initialize scale weight tmp_scale = F.maximum(F.abs(q_dict["min_val"]), F.abs(q_dict["max_val"])) tmp_scale = F.log(tmp_scale / 127) / math.log(2) - F.add_update(self.scale, tmp_scale, alpha=0.0, beta=1.0, bias=0.0) + self.scale[...] = tmp_scale return inp def get_qparams(self): diff --git a/imperative/python/megengine/quantization/observer.py b/imperative/python/megengine/quantization/observer.py index c3ffa77c..26d5465d 100644 --- a/imperative/python/megengine/quantization/observer.py +++ b/imperative/python/megengine/quantization/observer.py @@ -21,8 +21,8 @@ class Observer(Module): r""" A base class for Observer Module. - :param dtype: a string indicating to collect scale and zero_point of which dtype - :param narrow_range: Whether the absolute value of ``qmin`` is the same as ``qmax``, + :param dtype: a string indicating to collect scale and zero_point of which dtype. + :param narrow_range: whether the absolute value of ``qmin`` is the same as ``qmax``, instead of 1 greater. Usually True for weight and False for activation. """ diff --git a/imperative/python/megengine/quantization/utils.py b/imperative/python/megengine/quantization/utils.py index 810bbbb3..139f097d 100644 --- a/imperative/python/megengine/quantization/utils.py +++ b/imperative/python/megengine/quantization/utils.py @@ -63,7 +63,7 @@ qparam_dict = { def get_qparam_dict(mode: QuantMode): - """Return the quantization parameters dictory according to the mode. + """Return the quantization parameters dictionary according to the mode. """ return qparam_dict.get(mode, None) @@ -91,7 +91,7 @@ def fake_quant_tensor(inp: Tensor, qmin: int, qmax: int, q_dict: Dict) -> Tensor def fake_quant_bias(bias: Tensor, inp: Tensor, w_qat: Tensor) -> Tensor: - """Apply fake quantization to bias, the special scale from input tensor + """Apply fake quantization to bias, with the special scale from input tensor and weight tensor, the quantized type set to qint32 also. :param bias: the bias tensor which need to be faked. 
diff --git a/imperative/python/megengine/random/distribution.py b/imperative/python/megengine/random/distribution.py index 2190b8b7..fe074b37 100644 --- a/imperative/python/megengine/random/distribution.py +++ b/imperative/python/megengine/random/distribution.py @@ -21,12 +21,12 @@ __all__ = ["normal", "uniform"] def normal( mean: float = 0, std: float = 1, size: Optional[Iterable[int]] = None ) -> Tensor: - r"""Random variable with Gaussian distribution $N(\mu, \sigma)$ + r"""Random variable with Gaussian distribution :math:`N(\mu, \sigma)`. - :param size: Output tensor size - :param mean: The mean or expectation of the distribution - :param std: The standard deviation of the distribution (variance = $\sigma ^ 2$) - :return: The output tensor + :param size: output tensor size. + :param mean: the mean or expectation of the distribution. + :param std: the standard deviation of the distribution (variance = :math:`\sigma ^ 2`). + :return: the output tensor. Examples: @@ -59,12 +59,12 @@ def normal( def uniform( low: float = 0, high: float = 1, size: Optional[Iterable[int]] = None ) -> Tensor: - r"""Random variable with uniform distribution $U(0, 1)$ + r"""Random variable with uniform distribution $U(0, 1)$. - :param size: Output tensor size - :param low: Lower range - :param high: Upper range - :return: The output tensor + :param size: output tensor size. + :param low: lower range. + :param high: upper range. + :return: the output tensor. Examples: diff --git a/imperative/python/megengine/serialization.py b/imperative/python/megengine/serialization.py index 300d92b5..29a1954e 100644 --- a/imperative/python/megengine/serialization.py +++ b/imperative/python/megengine/serialization.py @@ -88,9 +88,9 @@ def load(f, map_location=None, pickle_module=pickle): :type map_location: str, dict or a function specifying the map rules :param map_location: Default: ``None``. - .. note:: + .. note:: - map_location defines device mapping. See examples for usage. + map_location defines device mapping. See examples for usage. :type pickle_module: :param pickle_module: Default: ``pickle``. diff --git a/imperative/python/megengine/tensor.py b/imperative/python/megengine/tensor.py index a8eae821..5d13530a 100644 --- a/imperative/python/megengine/tensor.py +++ b/imperative/python/megengine/tensor.py @@ -14,7 +14,7 @@ from .core import Tensor as _Tensor from .core.ops.builtin import Copy from .core.tensor.core import apply from .core.tensor.raw_tensor import as_device -from .device import get_default_device +from .device import _valid_device, get_default_device from .utils.deprecation import deprecated @@ -37,6 +37,12 @@ class Tensor(_Tensor): self *= 0 def to(self, device): + if isinstance(device, str) and not _valid_device(device): + raise ValueError( + "invalid device name {}. 
For the correct format of the device name, please refer to the instruction of megengine.device.set_default_device()".format( + device + ) + ) cn = as_device(device).to_c() return apply(Copy(comp_node=cn), self)[0] @@ -61,7 +67,7 @@ class Tensor(_Tensor): state = { "data": self.numpy(), - "device": str(self.device), + "device": self.device.logical_name, "dtype": self.dtype, "qdict": self.q_dict, } @@ -69,13 +75,13 @@ class Tensor(_Tensor): def __setstate__(self, state): data = state.pop("data") - device = state.pop("device") + logical_device = state.pop("device") if self.dmap_callback is not None: - assert isinstance(device, str) - device = self.dmap_callback(device) + assert isinstance(logical_device, str) + logical_device = self.dmap_callback(logical_device) dtype = state.pop("dtype") self.q_dict = state.pop("qdict") - super().__init__(data, dtype=dtype, device=device) + super().__init__(data, dtype=dtype, device=logical_device) def detach(self): r""" diff --git a/imperative/python/megengine/test/__init__.py b/imperative/python/megengine/test/__init__.py deleted file mode 100644 index 44ed54c2..00000000 --- a/imperative/python/megengine/test/__init__.py +++ /dev/null @@ -1,67 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import numpy as np - - -def assertTensorClose( - v0, v1, *, max_err: float = 1e-6, allow_special_values: bool = False, name=None -): - """ - :param allow_special_values: whether to allow :attr:`v0` and :attr:`v1` to contain inf and nan values. - :param max_err: relative error - """ - __tracebackhide__ = True # pylint: disable=unused-variable - - assert ( - v0.dtype == v1.dtype - ), "Two Tensor must have same dtype, but the inputs are {} and {}".format( - v0.dtype, v1.dtype - ) - v0 = np.ascontiguousarray(v0, dtype=np.float32).copy() - v1 = np.ascontiguousarray(v1, dtype=np.float32).copy() - if allow_special_values: - # check nan and rm it - v0_nan_mask = np.isnan(v0) - if np.any(v0_nan_mask): - assert np.array_equiv(v0_nan_mask, np.isnan(v1)), (v0, v1) - v0[v0_nan_mask] = 0 - v1[v0_nan_mask] = 0 - # check inf and rm it - v0_inf_mask = v0 == float("inf") - if np.any(v0_inf_mask): - assert np.array_equiv(v0_inf_mask, v1 == float("inf")), (v0, v1) - v0[v0_inf_mask] = 0 - v1[v0_inf_mask] = 0 - # check -inf and rm it - v0_inf_mask = v0 == float("-inf") - if np.any(v0_inf_mask): - assert np.array_equiv(v0_inf_mask, v1 == float("-inf")), (v0, v1) - v0[v0_inf_mask] = 0 - v1[v0_inf_mask] = 0 - else: - assert np.isfinite(v0.sum()) and np.isfinite(v1.sum()), (v0, v1) - - assert v0.shape == v1.shape, "Two tensor must have same shape({} v.s. 
{})".format( - v0.shape, v1.shape - ) - vdiv = np.max([np.abs(v0), np.abs(v1), np.ones_like(v0)], axis=0) - err = np.abs(v0 - v1) / vdiv - check = err > max_err - if check.sum(): - idx = tuple(i[0] for i in np.nonzero(check)) - if name is None: - name = "tensor" - else: - name = "tensor {}".format(name) - raise AssertionError( - "{} not equal: " - "shape={} nonequal_idx={} v0={} v1={} err={}".format( - name, v0.shape, idx, v0[idx], v1[idx], err[idx] - ) - ) diff --git a/imperative/python/megengine/core/utils/comp_graph_tools.py b/imperative/python/megengine/utils/comp_graph_tools.py similarity index 75% rename from imperative/python/megengine/core/utils/comp_graph_tools.py rename to imperative/python/megengine/utils/comp_graph_tools.py index ceffcc7e..0dfce687 100644 --- a/imperative/python/megengine/core/utils/comp_graph_tools.py +++ b/imperative/python/megengine/utils/comp_graph_tools.py @@ -8,13 +8,17 @@ import collections from typing import Dict, List -from .. import _imperative_rt -from .._imperative_rt import OperatorNode, VarNode +import numpy + +from ..core import _imperative_rt +from ..core._imperative_rt import OperatorNode, VarNode +from ..core.tensor import megbrain_graph as G +from ..core.tensor.raw_tensor import as_raw_tensor def get_dep_vars(var: VarNode, var_type: str = None) -> List[VarNode]: - """return :class:`.tensor.core.megbrain_graph.VarNode` of type ``var_type`` that input ``var`` - depands on. If ``var_type`` is None, return all types. + """Returns :class:`.tensor.core.megbrain_graph.VarNode` of type ``var_type`` that input ``var`` + depands on. If ``var_type`` is None, returns all types. """ outputs = [] memo = set() @@ -42,14 +46,14 @@ def get_dep_vars(var: VarNode, var_type: str = None) -> List[VarNode]: def get_owner_opr_inputs(var: VarNode) -> List[VarNode]: - """get the inputs of owner opr of a variable + """Gets the inputs of owner opr of a variable. """ assert isinstance(var, VarNode) return var.owner.inputs def get_owner_opr_type(var: VarNode) -> str: - """get the type of owner opr of a variable + """Gets the type of owner opr of a variable. """ assert isinstance(var, VarNode) @@ -57,16 +61,16 @@ def get_owner_opr_type(var: VarNode) -> str: def get_opr_type(opr: OperatorNode) -> str: - """get the type of a opr + """Gets the type of an opr. """ assert isinstance(opr, OperatorNode) return opr.type def graph_traversal(outputs: VarNode): - """helper function to traverse the computing graph and return enough useful information + """Helper function to traverse the computing graph and return enough useful information. - :param outputs: model outputs + :param outputs: model outputs. :return: tuple (map_oprs, map_vars, var2oprs, opr2receivers, indegree2opr, opr2indegree) WHERE map_oprs is dict from opr_id to actual opr @@ -120,11 +124,11 @@ def graph_traversal(outputs: VarNode): def get_oprs_seq(outputs: List[VarNode], prune_reshape=False) -> List[OperatorNode]: - """get oprs in some topological order for a dumped model + """Gets oprs in some topological order for a dumped model. - :param outputs: model outputs - :param prune_reshape: whether to prune the operators useless during inference - :return: opr list with some correct execution order + :param outputs: model outputs. + :param prune_reshape: whether to prune the useless operators during inference. + :return: opr list with some correct execution order. 
""" def topological_sort(map_oprs, opr2receivers, indegree2opr, opr2indegree): @@ -190,13 +194,13 @@ def get_oprs_seq(outputs: List[VarNode], prune_reshape=False) -> List[OperatorNo def replace_vars(dst: VarNode, varmap: Dict[VarNode, VarNode]) -> List[VarNode]: - """replace vars in the graph + """Replaces vars in the graph. - :param dst: target vars representing the graph - :param varmap: the map that specifies how to replace the vars + :param dst: target vars representing the graph. + :param varmap: the map that specifies how to replace the vars. :return: new vars that correspond to ``dst`` with all the dependencies - replaced + replaced. """ dst_vec = [] repl_src_vec = [] @@ -217,13 +221,13 @@ def replace_vars(dst: VarNode, varmap: Dict[VarNode, VarNode]) -> List[VarNode]: def replace_oprs( dst: List[VarNode], oprmap: Dict[OperatorNode, OperatorNode] ) -> List[VarNode]: - """Replace operators in the graph. + """Replaces operators in the graph. - :param dst: target vars representing the graph - :param oprmap: the map that specifies how to replace the operators + :param dst: target vars representing the graph. + :param oprmap: the map that specifies how to replace the operators. :return: new vars that correspond to ``dst`` with all the dependencies - replaced + replaced. """ dst_vec = [] repl_src_vec = [] @@ -242,12 +246,42 @@ def replace_oprs( def set_priority_to_id(dest_vars): - """For all oprs in the subgraph constructed by dest_vars - set its priority to id if its original priority is zero - :param dest_vars: target vars representing the graph + """For all oprs in the subgraph constructed by dest_vars, + sets its priority to id if its original priority is zero. + :param dest_vars: target vars representing the graph. """ dest_vec = [] for i in dest_vars: assert isinstance(i, VarNode) dest_vec.append(i) _imperative_rt.graph._set_priority_to_id(dest_vec) + + +def load_and_inference(file, inp_data_list: List[numpy.ndarray]) -> List[numpy.ndarray]: + """Loads a serialized computing graph and run inference with input data. + + :param file: path or handle of the input file. + :param inp_data_list: list of input data. + :return: list of inference results. 
+
+    """
+    *_, out_list = G.load_graph(file)
+    inputs = get_dep_vars(out_list, "Host2DeviceCopy")
+    replace_dict = {}
+    inp_node_list = []
+    for i in inputs:
+        inp_node = G.InputNode(
+            device="xpux", dtype=i.dtype, graph=i.graph
+        )
+        replace_dict[i] = inp_node.outputs[0]
+        inp_node_list.append(inp_node)
+    new_out = replace_vars(out_list, replace_dict)
+    out_node_list = [G.OutputNode(i) for i in new_out]
+    new_out_list = [i.outputs[0] for i in out_node_list]
+    cg = new_out_list[0].graph
+    func = cg.compile(new_out_list)
+    for node, value in zip(inp_node_list, inp_data_list):
+        node.set_value(as_raw_tensor(value)._dev_tensor())
+    func.execute()
+    out_data_list = [o.get_value().numpy() for o in out_node_list]
+    return out_data_list
diff --git a/imperative/python/megengine/utils/http_download.py b/imperative/python/megengine/utils/http_download.py
index add2a649..b24e7c93 100644
--- a/imperative/python/megengine/utils/http_download.py
+++ b/imperative/python/megengine/utils/http_download.py
@@ -23,16 +23,16 @@
 HTTP_CONNECTION_TIMEOUT = 5
 
 
 class HTTPDownloadError(BaseException):
-    """The class that represents http request error"""
+    """The class that represents http request error."""
 
 
 def download_from_url(url: str, dst: str, http_read_timeout=120):
     """
-    Downloads file from given url to ``dst``
+    Downloads file from given url to ``dst``.
 
-    :param url: source URL
-    :param dst: saving path
-    :param http_read_timeout: how many seconds to wait for data before giving up
+    :param url: source URL.
+    :param dst: saving path.
+    :param http_read_timeout: how many seconds to wait for data before giving up.
     """
     dst = os.path.expanduser(dst)
     dst_dir = os.path.dirname(dst)
diff --git a/imperative/python/megengine/utils/max_recursion_limit.py b/imperative/python/megengine/utils/max_recursion_limit.py
index d7bce6e8..c8acda3e 100644
--- a/imperative/python/megengine/utils/max_recursion_limit.py
+++ b/imperative/python/megengine/utils/max_recursion_limit.py
@@ -73,6 +73,6 @@
 _max_recursion_limit_context_manager = AlternativeRecursionLimit(2 ** 31 - 1)
 
 
 def max_recursion_limit():
-    r"""Sets recursion limit to the max possible value
+    r"""Sets recursion limit to the max possible value.
     """
     return _max_recursion_limit_context_manager
diff --git a/imperative/python/megengine/utils/plugin.py b/imperative/python/megengine/utils/plugin.py
index 2ded7758..7b9343a9 100644
--- a/imperative/python/megengine/utils/plugin.py
+++ b/imperative/python/megengine/utils/plugin.py
@@ -12,13 +12,13 @@
 import numpy as np
 
 
 def load_tensor_binary(fobj):
-    """load a tensor dumped by the :class:`BinaryOprIODump` plugin; the actual
+    """Loads a tensor dumped by the :class:`BinaryOprIODump` plugin; the actual
     tensor value dump is implemented by ``mgb::debug::dump_tensor``.
 
     Multiple values can be compared by ``tools/compare_binary_iodump.py``.
 
-    :param fobj: file object, or a string that contains the file name
-    :return: tuple ``(tensor_value, tensor_name)``
+    :param fobj: file object, or a string that contains the file name.
+    :return: tuple ``(tensor_value, tensor_name)``.
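+
+    Example (``opr_iodump.bin`` is a hypothetical dump file)::
+
+        val, name = load_tensor_binary("opr_iodump.bin")
+        print(name, val.shape, val.dtype)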
""" if isinstance(fobj, str): with open(fobj, "rb") as fin: diff --git a/imperative/python/megengine/utils/profile_analyzer.py b/imperative/python/megengine/utils/profile_analyzer.py index 75cc0c0c..62419766 100644 --- a/imperative/python/megengine/utils/profile_analyzer.py +++ b/imperative/python/megengine/utils/profile_analyzer.py @@ -16,7 +16,7 @@ import numpy as np class NonExistNum: """An object that behaves like a number but means a field does not exist; It is - always greater than any real number + always greater than any real number. """ def __truediv__(self, _): @@ -69,12 +69,12 @@ class OprProfRst: footprint = None """A mapping from ``"memory"`` or ``"computation"`` to the actual number - of corresponding operations""" + of corresponding operations.""" def __init__(self, entry: dict): """Opr profiling initialization, which sets up name, type and id of opr_info. - :param entry: profiling json exec_graph items + :param entry: profiling json exec_graph items. """ assert isinstance(entry, dict) self.opr_info = collections.OrderedDict() @@ -84,7 +84,7 @@ class OprProfRst: self.footprint = collections.defaultdict(NonExistNum) def update_device_prof_info(self, dev_time: dict): - """Updates device profiling info + """Updates device profiling info. :param dev_time: device time for single opr, is an attribute of profiling result. @@ -93,7 +93,7 @@ class OprProfRst: self.time_dict["device"].append(copy.deepcopy(dev_time)) def update_host_prof_info(self, host_time: dict): - """Updates host profiling info + """Updates host profiling info. :param host_time: host time for single opr, is an attribute of profiling result. @@ -102,7 +102,7 @@ class OprProfRst: self.time_dict["host"].append(copy.deepcopy(host_time)) def update_footprint(self, footprint: dict): - """Updates opr footprint + """Updates opr footprint. :param footprint: footprint for single opr, is an attribute of profiling result. @@ -128,7 +128,7 @@ class Record: ] def __init__(self, time: float, info: dict, footprint: dict): - """Initializes single record + """Initializes single record. :param time: opr running time, evaluated by applying users providing function to OprProfRst. @@ -153,7 +153,7 @@ class Record: self.opr_id = int(self.opr_id) def get_column_by_name(self, name: str = None): - """extracts column value by its column name + """Extracts column value by its column name. :param name: column name, None for time. """ @@ -165,7 +165,7 @@ class Record: class ProfileAnalyzer: def __init__(self, obj: dict, opr_filter: Callable = lambda opr, inp, out: True): - """Initializes ProfileAnalyzer + """Initializes ProfileAnalyzer. :param obj: dict dumped from json str. :param opr_filter: function that filter oprs. @@ -202,11 +202,11 @@ class ProfileAnalyzer: def _aggregate( self, records: List[Record], aop: Union[str, Callable], atype: Optional[str] ) -> List[Record]: - """Aggregate operation - - :param records: selected records + """Aggregate operation. + + :param records: selected records. :param aop: aggregate operation, if aop is str, we would replace it - with associated numpy function wth aop name" + with associated numpy function wth aop name". :param atype: the type aggregated by, None for aggregating all into single record. """ @@ -247,10 +247,10 @@ class ProfileAnalyzer: return rst def _sort(self, records: List[Record], sort_by: str) -> List[Record]: - """sort operation + """Sort operation. :param records: the records after aggregate operation. 
- :param sort_by: keyword for sorting the list + :param sort_by: keyword for sorting the list. """ if sort_by is None: return records @@ -271,14 +271,14 @@ class ProfileAnalyzer: sort_by: str = None, top_k: int = 0, ) -> List[Record]: - """Select operation + """Select operation. :param time_func: time_func provided by user, would apply to every - OprProfRst + OprProfRst. :param opr_filter: filter satisfied operatiors. :param aggregate: function that apply to list of records which are - aggregated by atype - :param aggregate_by: the type aggregated by + aggregated by atype. + :param aggregate_by: the type aggregated by. :param sort_by: keyword for sorting all records. :param top_k: specify the maximum number of records. :return: the records that go through select, aggregate, sort. @@ -304,18 +304,18 @@ class TimeFuncHelper: @staticmethod def _eval_time(prof_type, end_key, func, opr_prof): - """Eval time + """Eval time. :type prof_type: str - :param prof_type: 'host' or 'device' + :param prof_type: 'host' or 'device'. :type end_key: str - :param end_key: 'kern' or 'end' + :param end_key: 'kern' or 'end'. :type func: function :param func: apply to list of all ``thread`` of ``gpu`` time. :type opr_prof: `class OprProfRst` - :param opr_prof: operator profiling result + :param opr_prof: operator profiling result. :rtype: float - :return: time + :return: time. """ if prof_type not in opr_prof.time_dict: @@ -327,10 +327,10 @@ class TimeFuncHelper: def eval_time_func(prof_type: str, end_key: str, func: Callable) -> float: """Eval oprerator profile time. - :param prof_type: 'host' or 'device' - :param end_key: 'kern' or 'end' + :param prof_type: 'host' or 'device'. + :param end_key: 'kern' or 'end'. :param func: apply to list of all ``thread`` of ``gpu`` time. - :return: Eval time results + :return: eval time results. """ return functools.partial(TimeFuncHelper._eval_time, prof_type, end_key, func) @@ -338,18 +338,18 @@ class TimeFuncHelper: def _min_start( prof_type, end_key, func, opr_prof ): # pylint: disable=unused-argument - """Eval minimum start time + """Eval minimum start time. :type prof_type: str - :param prof_type: 'host' or 'device' + :param prof_type: 'host' or 'device'. :type end_key: str - :param end_key: 'kern' or 'end' + :param end_key: 'kern' or 'end'. :type func: function :param func: apply to list of all ``thread`` of ``gpu`` time. :type opr_prof: `class OprProfRst` - :param opr_prof: operator profiling result + :param opr_prof: operator profiling result. :rtype: float - :return: time + :return: time. """ if prof_type not in opr_prof.time_dict: return None @@ -360,12 +360,12 @@ class TimeFuncHelper: def min_start_func( prof_type: str, end_key: str, func: Callable ) -> float: # pylint: disable=unused-argument - """Eval oprerator profile min start time + """Eval oprerator profile min start time. - :param prof_type: 'host' or 'device' - :param end_key: 'kern' or 'end' + :param prof_type: 'host' or 'device'. + :param end_key: 'kern' or 'end'. :param func: apply to list of all ``thread`` of ``gpu`` time. - :return: Eval time results + :return: eval time results. """ return functools.partial(TimeFuncHelper._min_start, prof_type, end_key, func) @@ -374,15 +374,15 @@ class TimeFuncHelper: """Eval maximum end time :type prof_type: str - :param prof_type: 'host' or 'device' + :param prof_type: 'host' or 'device'. :type end_key: str - :param end_key: 'kern' or 'end' + :param end_key: 'kern' or 'end'. :type func: function :param func: apply to list of all ``thread`` of ``gpu`` time. 
:type opr_prof: `class OprProfRst` - :param opr_prof: operator profiling result + :param opr_prof: operator profiling result. :rtype: float - :return: time + :return: time. """ if prof_type not in opr_prof.time_dict: return None @@ -391,11 +391,11 @@ class TimeFuncHelper: @staticmethod def max_end_func(prof_type: str, end_key: str, func: Callable) -> float: - """Eval oprerator profile max end time + """Eval oprerator profile max end time. - :param prof_type: 'host' or 'device' - :param end_key: 'kern' or 'end' + :param prof_type: 'host' or 'device'. + :param end_key: 'kern' or 'end'. :param func: apply to list of all ``thread`` of ``gpu`` time. - :return: Eval time results + :return: eval time results. """ return functools.partial(TimeFuncHelper._max_end, prof_type, end_key, func) diff --git a/imperative/python/megengine/utils/profiler.py b/imperative/python/megengine/utils/profiler.py index 2b805afb..668ed9b4 100644 --- a/imperative/python/megengine/utils/profiler.py +++ b/imperative/python/megengine/utils/profiler.py @@ -9,13 +9,155 @@ import base64 import json import os -from typing import List, Optional +import re +from typing import Iterable, List, Optional from ..core._imperative_rt import OperatorNodeConfig, ProfileEntry from ..core._imperative_rt import ProfilerImpl as _Profiler from ..core._imperative_rt.imperative import sync from ..core._imperative_rt.ops import CollectiveCommMode -from ..core.ops.builtin import GetVarShape + + +def _make_dict(**kwargs): + unused_keys = [] + for k, v in kwargs.items(): + if v is None: + unused_keys.append(k) + for k in unused_keys: + del kwargs[k] + return kwargs + + +def _print_opnode_config(config): + return _make_dict( + name=config.name, dtype=config.dtype, comp_node_arr=config.comp_node_arr, + ) + + +def _dump_chrome_timeline(entries: List[ProfileEntry], path: str): + pid = os.getpid() + trace_events = [] + + def append_event(**kwargs): + trace_events.append(_make_dict(**kwargs)) + + for id, entry in enumerate(entries): + op = entry.op + name = type(op).__name__ + host_begin, host_end = entry.host + device_list = entry.device_list + args = Profiler.fetch_attrs(op) + args["__id__"] = "[{}]".format(id) + cat = name + for ts, ph in [(host_begin, "B"), (host_end, "E")]: + append_event( + name=name, ph=ph, ts=ts * 1000, pid=pid, tid="host", args=args, cat=cat, + ) + for device, device_begin, device_end in device_list: + for ts, ph in [(device_begin(), "B"), (device_end(), "E")]: + append_event( + name=name, ph=ph, ts=ts * 1000, pid=pid, tid=str(device), args=args, + ) + with open("{}.chrome_timeline.json".format(path), "w") as f: + json.dump(trace_events, f, indent=2) + + +def _dump_compatible(entries: List[ProfileEntry], path: str): + obj = { + "graph_exec": {"var": [], "operator": {}}, + "profiler": {"device": {}, "host": {}, "opr_footprint": {}}, + } + var_list = obj["graph_exec"]["var"] + operator_dict = obj["graph_exec"]["operator"] + device_dict = obj["profiler"]["device"] + host_dict = obj["profiler"]["host"] + opr_foot_print_dict = obj["profiler"]["opr_footprint"] + + def add_var(var) -> int: + var_id = len(var_list) + var_list.append( + {"comp_node": str(var[2]),} + ) + return var_id + + for op_id, entry in enumerate(entries): + operator_dict[op_id] = { + "input": [add_var(var) for var in entry.inputs], + "output": [add_var(var) for var in entry.outputs], + "name": str(entry.op.ctype()), + "type": "imperative", + "id": entry.id, + } + op_device_dict = {} + for device, device_begin, device_end in entry.device_list: + 
op_device_dict[str(device)] = { + "start": device_begin(), + "kern": device_begin(), + "end": device_end(), + } + device_dict[op_id] = op_device_dict + host_begin, host_end = entry.host + host_dict[op_id] = { + "host": {"start": host_begin, "kern": host_begin, "end": host_end} + } + opr_footprint = { + "out_shapes": [oup[1] for oup in entry.outputs], + "in_shapes": [inp[1] for inp in entry.inputs], + "params": {}, + } + if entry.memory > 0: + opr_footprint["memory"] = entry.memory + if entry.computation > 0: + opr_footprint["computation"] = entry.computation + opr_foot_print_dict[op_id] = opr_footprint + with open("{}.compatible.json".format(path), "w") as f: + json.dump(obj, f, indent=2) + + +def _dump_graphviz(entries: List[ProfileEntry], path: str): + import graphviz + import json + + graph = graphviz.Digraph() + graph.graph_attr["ordering"] = "out" + var_cache = {} + + def cache_var(var_id, var_shape): + if var_id not in var_cache: + var_name = "var({})".format(var_id) + var_label = "{}\nshape:{}\n".format(var_name, var_shape) + graph.node(var_name, var_label) + var_cache[var_id] = var_name + return var_cache[var_id] + + for op_id, entry in enumerate(entries): + op = entry.op + op_name = "op({})".format(op_id) + op_type = type(op).__name__ + op_attrs = Profiler.fetch_attrs(op) + label_lines = [] + if "param" in op_attrs: + del op_attrs["param"] + label_lines.append("{}:{}".format(op_name, op_type)) + for k, v in op_attrs.items(): + label_lines.append("attr[{}]: {}".format(k, v)) + op_param_str = entry.param + if len(op_param_str) > 0: + op_param = json.loads(op_param_str) + for k, v in op_param.items(): + label_lines.append("param[{}]:{}".format(k, v)) + host_begin, host_end = entry.host + label_lines.append("time[host]: {:f}ms".format(host_end - host_begin)) + for device, device_begin, device_end in entry.device_list: + device_time = device_end() - device_begin() + label_lines.append("time[{}]: {:f}ms".format(device, device_time)) + op_label = "\n".join(label_lines) + graph.node(op_name, op_label, shape="rectangle") + for var_id, shape, device in entry.inputs: + graph.edge(cache_var(var_id, shape), op_name) + for var_id, shape, device in entry.outputs: + graph.edge(op_name, cache_var(var_id, shape)) + graph.save("{}.graphviz.dot".format(path)) class Profiler: @@ -23,7 +165,7 @@ class Profiler: Profile graph execution in imperative mode. :type path: Optional[str] - :param path: default path for profiler to dump + :param path: default path prefix for profiler to dump. 
Examples: @@ -31,59 +173,67 @@ class Profiler: import megengine as mge import megengine.module as M - import megengine.utils.profiler.Profiler + from megengine.utils.profiler import Profiler # With Learnable Parameters for iter in range(0, 10): # Only profile record of last iter would be saved - with Profiler("profile.json"): + with Profiler("profile"): # your code here # Then open the profile file in chrome timeline window """ - # see https://github.com/catapult-project/catapult/blob/master/tracing/tracing/base/color_scheme.html - GOOD = "good" - BAD = "bad" - TERRIBLE = "terrible" + CHROME_TIMELINE = "chrome_timeline" + COMPATIBLE = "compatible" + GRAPHVIZ = "graphviz" + + WITH_FOOTPRINT = 1 - BLACK = "black" - GREY = "grey" - WHITE = "white" - YELLOW = "yellow" - OLIVE = "olive" + _type_map = { + OperatorNodeConfig: lambda x: _print_opnode_config(x), + bytes: lambda x: base64.encodebytes(x).decode("ascii"), + CollectiveCommMode: lambda x: str(x), + } - def __init__(self, path: str = "profile.json"): + _dumper_map = { + CHROME_TIMELINE: _dump_chrome_timeline, + COMPATIBLE: _dump_compatible, + GRAPHVIZ: _dump_graphviz, + } + + def __init__( + self, + path: str = "profile", + *, + formats: Iterable[str] = (CHROME_TIMELINE,), + type_filter: str = ".*", + exit_dump: bool = True + ) -> None: self._impl = _Profiler() self._path = path - self._color_map = {} - self._type_map = { - OperatorNodeConfig: lambda x: self.print_opnode_config(x), - bytes: lambda x: base64.encodebytes(x).decode("ascii"), - CollectiveCommMode: lambda x: str(x), - } + + if isinstance(formats, str): + formats = (formats,) + + self._filter = type_filter + self._dumpers = [Profiler._dumper_map[fmt] for fmt in formats] + self._exit_dump = exit_dump def __enter__(self): sync() - self._impl.start() + self._impl.start(Profiler.WITH_FOOTPRINT) return self - def __exit__(self, val, type, trace): + def __exit__(self, val, tp, trace): + if self._exit_dump: + self.dump() sync() self._impl.stop() - if self._path is not None: - self.dump() - - def recolor(self, target: str, color: str): - self._color_map[target] = color - return self + self._impl.clear() - def print_opnode_config(self, config): - return self.make_dict( - name=config.name, dtype=config.dtype, comp_node_arr=config.comp_node_arr, - ) - - def fetch_attrs(self, op): + @classmethod + def fetch_attrs(cls, op): attrs = dir(op) results = {} for attr in attrs: @@ -93,61 +243,29 @@ class Profiler: if callable(value): continue value_type = type(value) - if value_type in self._type_map: - value = self._type_map[value_type](value) + if value_type in cls._type_map: + value = cls._type_map[value_type](value) results[attr] = value return results - def make_dict(self, **kwargs): - unused_keys = [] - for k, v in kwargs.items(): - if v is None: - unused_keys.append(k) - for k in unused_keys: - del kwargs[k] - return kwargs - def dump(self, path: Optional[str] = None): - pid = os.getpid() + sync() + raw = [ + entry + for entry in self._impl.dump() + if re.match(self._filter, type(entry.op).__name__) + ] if path is None: path = self._path - trace_events = [] - - def append_event(**kwargs): - trace_events.append(self.make_dict(**kwargs)) - - entries: List[ProfileEntry] = self._impl.dump() - - for id, entry in enumerate(entries): - op = entry.op - name = type(op).__name__ - host_begin, host_end = entry.host - device_list = entry.device_list - args = self.fetch_attrs(op) - args["__id__"] = "[{}]".format(id) - cname = self._color_map[name] if name in self._color_map else None - cat = name - for 
ts, ph in [(host_begin, "B"), (host_end, "E")]: - append_event( - name=name, - ph=ph, - ts=ts * 1000, - pid=pid, - tid="host", - args=args, - cname=cname, - cat=cat, - ) - for device, device_begin, device_end in device_list: - for ts, ph in [(device_begin(), "B"), (device_end(), "E")]: - append_event( - name=name, - ph=ph, - ts=ts * 1000, - pid=pid, - tid=str(device), - args=args, - cname=cname, - ) - with open(path, "w") as f: - json.dump(trace_events, f, indent=2) + for dumper in self._dumpers: + dumper(raw, path) + + def __call__(self, func): + def wrapper(*args, **kwargs): + with self: + return func(*args, **kwargs) + + return wrapper + + +profile = Profiler diff --git a/imperative/python/megengine/utils/tensor_sanity_check.py b/imperative/python/megengine/utils/tensor_sanity_check.py index b77bdd45..55e7241e 100644 --- a/imperative/python/megengine/utils/tensor_sanity_check.py +++ b/imperative/python/megengine/utils/tensor_sanity_check.py @@ -7,17 +7,15 @@ class TensorSanityCheck: Examples: - .. testcode:: + .. code-block:: python + from megengine import tensor from megengine.utils.tensor_sanity_check import TensorSanityCheck with TensorSanityCheck() as checker: a = tensor([1, 2]) b = tensor([3, 4]) c = a + b - print(c.numpy()) - - .. testoutput:: - [4 6] + """ def __init__(self): diff --git a/imperative/python/megengine/utils/types.py b/imperative/python/megengine/utils/types.py index 03f0709e..0475ebf7 100644 --- a/imperative/python/megengine/utils/types.py +++ b/imperative/python/megengine/utils/types.py @@ -11,10 +11,10 @@ import functools def get_ndtuple(value, *, n, allow_zero=True): - r"""Converts possibly 1D tuple to nd tuple + r"""Converts possibly 1D tuple to nd tuple. :type allow_zero: bool - :param allow_zero: whether to allow zero tuple value""" + :param allow_zero: whether to allow zero tuple value.""" if not isinstance(value, collections.abc.Iterable): value = int(value) value = tuple([value for i in range(n)]) diff --git a/imperative/python/src/common.cpp b/imperative/python/src/common.cpp index aeb1f9e9..136d6dde 100644 --- a/imperative/python/src/common.cpp +++ b/imperative/python/src/common.cpp @@ -55,10 +55,16 @@ void init_common(py::module m) { auto&& PyCompNode = py::class_(m, "CompNode") .def(py::init()) .def(py::init(py::overload_cast(&CompNode::load))) + .def_property_readonly("logical_name", [](const CompNode& cn) { + return cn.to_string_logical(); + }) .def("create_event", &CompNode::create_event, py::arg("flags") = 0ul) .def("_set_default_device", &set_default_device) .def("_get_default_device", &get_default_device) .def("__str__", &CompNode::to_string_logical) + .def("__repr__", [](const CompNode& cn) { + return py::str("\"" + cn.to_string() + "\" from \"" + cn.to_string_logical() + "\""); + }) .def_static("_sync_all", &CompNode::sync_all) .def(py::self == py::self) .def_static("_get_device_count", &CompNode::get_device_count, diff --git a/imperative/python/src/dispatcher.cpp b/imperative/python/src/dispatcher.cpp index 2d2cd844..616e1791 100644 --- a/imperative/python/src/dispatcher.cpp +++ b/imperative/python/src/dispatcher.cpp @@ -151,16 +151,19 @@ struct Dispatcher { public: static constexpr auto tp_name = "Dispatcher"; - PyObject* tp_vectorcall(PyObject*const* args, Py_ssize_t nargs) { - if (!prepare_call(args, nargs)) return nullptr; - return do_call([=](PyObject* func){return _PyObject_FastCall(func, const_cast(args), nargs);}); - } - PyObject* tp_call(PyObject* args, PyObject* kwargs) { if (!prepare_call(&PyTuple_GET_ITEM(args, 0), 
PyTuple_GET_SIZE(args))) return nullptr; return do_call([=](PyObject* func){return PyObject_Call(func, args, kwargs);}); } +#if PY_MINOR_VERSION >= 6 + PyObject* tp_vectorcall(PyObject*const* args, Py_ssize_t nargs) { + if (!prepare_call(args, nargs)) return nullptr; + return do_call([=](PyObject* func){return _PyObject_FastCall(func, const_cast(args), nargs);}); + } +#endif + +#if PY_MINOR_VERSION >= 6 PyObject* super(PyObject*const* args, Py_ssize_t nargs) { if (stack.empty()) { PyErr_SetString(PyExc_RuntimeError, "super called at top level"); @@ -169,6 +172,16 @@ public: stack.emplace_back_safely(stack.back()).mro_offset++; return do_call([=](PyObject* func){return _PyObject_FastCall(func, const_cast(args), nargs);}); } +#else + PyObject* super(PyObject* args, PyObject* kwargs) { + if (stack.empty()) { + PyErr_SetString(PyExc_RuntimeError, "super called at top level"); + return nullptr; + } + stack.emplace_back_safely(stack.back()).mro_offset++; + return do_call([=](PyObject* func){return PyObject_Call(func, args, kwargs);}); + } +#endif void enable(PyObject* func) { auto obj = py::reinterpret_borrow(func); @@ -204,7 +217,11 @@ void init_dispatcher(py::module m) { .def<&Dispatcher::enable>("enable") .def<&Dispatcher::disable>("disable") .def<&Dispatcher::clear_cache>("clear_cache") +#if PY_MINOR_VERSION >= 6 .def<&Dispatcher::tp_vectorcall>("call") +#else + .def<&Dispatcher::tp_call>("call") +#endif .def<&Dispatcher::super>("super") .finalize(); if (!dispatcher_type) throw py::error_already_set(); diff --git a/imperative/python/src/graph_rt.cpp b/imperative/python/src/graph_rt.cpp index d6a9bba8..dfe306af 100644 --- a/imperative/python/src/graph_rt.cpp +++ b/imperative/python/src/graph_rt.cpp @@ -49,17 +49,28 @@ class _CompGraphProfilerImpl { return json->to_string(); } }; + +struct WeakRendezvousArray: + public std::vector>, + public UserDataContainer::UserData { + MGB_TYPEINFO_OBJ_DECL; +}; +MGB_TYPEINFO_OBJ_IMPL(WeakRendezvousArray); } #define DEF_READWRITE(name) .def_readwrite(#name, &CURRENT_CLASS::name) template auto def_rendezvous(py::object m, const char* name) { return py::class_, std::shared_ptr>>(m, name) - .def(py::init([](){return std::make_shared>();})) + .def(py::init([](){return Rendezvous::make();})) .def("set", [](Rendezvous& r, T v) {r.set(std::move(v));}) .def("get", [](Rendezvous& r) {return r.get();}, py::call_guard()) .def("drop", &Rendezvous::drop) - .def("reset", &Rendezvous::reset); + .def("reset", &Rendezvous::reset) + .def("set_exception", [](Rendezvous& r, std::string&& message) { + r.set_exception(std::make_exception_ptr( + std::runtime_error(std::move(message)))); + }); } using TensorAttr = LogicalTensorDesc; @@ -186,7 +197,21 @@ void init_graph_rt(py::module m) { py::class_(m, "AsyncExecutable") .def("execute", &cg::AsyncExecutable::execute, py::call_guard()) - .def("wait", &cg::AsyncExecutable::wait, py::call_guard()); + .def("wait", &cg::AsyncExecutable::wait, py::call_guard()) + // only used for exception handle + .def_property_readonly("_all_rendezvous", [](cg::AsyncExecutable* exec) { + auto ud = exec->owner_graph()->options().user_data + .get_user_data(); + std::vector> ret; + if (ud.second) { + for (auto&& r: *ud.first[0]) { + if (auto p = r.lock()) { + ret.emplace_back(std::move(p)); + } + } + } + return ret; + }); auto PyComputingGraph = py::class_>(m, "ComputingGraph") .def(py::init(py::overload_cast<>(&cg::ComputingGraph::make))) @@ -267,7 +292,7 @@ void init_graph_rt(py::module m) { {"opr_types", to_json(opr_types)}, {"dtypes", 
to_json(dtype_names)}, {"elemwise_modes", to_json(elemwise_modes)}, - }); + })->to_string(); }); m.def("dump_graph", []( @@ -483,13 +508,20 @@ void init_graph_rt(py::module m) { }, py::arg(), py::arg(), py::arg(), py::arg() = py::none(), py::arg() = py::tuple(), py::arg("graph") = py::none()); - auto output_callback = [](auto callback, const std::vector& inputs, bool borrow = false) { + auto output_callback = [](auto callback, const std::vector& inputs, + std::shared_ptr r = {}, bool borrow = false, bool prefer_host_value = false) { + if (r) { + mgb_assert(inputs.size()); + auto cg = inputs[0]->owner_graph(); + cg->options().user_data.get_user_data_or_create() + ->emplace_back(r); + } SymbolVarArray sinputs; for (auto i : inputs) { sinputs.emplace_back(i); } static_assert(!std::is_reference::value); - opr::OutputCallback::Param param{std::move(callback), borrow}; + opr::OutputCallback::Param param{std::move(callback), borrow, prefer_host_value}; auto output = opr::OutputCallback::make(std::move(param), sinputs); return output.node(); }; @@ -508,7 +540,7 @@ void init_graph_rt(py::module m) { auto f = [p](DeviceTensorND dv) { p->set(std::move(dv)); }; - return output_callback(std::move(f), std::move(inputs)); + return output_callback(std::move(f), std::move(inputs), p); }); m.def("value_output_callback", [output_callback](std::shared_ptr> p, std::vector inputs) { @@ -519,13 +551,13 @@ void init_graph_rt(py::module m) { hv_with_event.second->record(); p->set(std::move(hv_with_event)); }; - return output_callback(std::move(f), std::move(inputs), true); + return output_callback(std::move(f), std::move(inputs), p, true, true); }); m.def("attr_output_callback", [output_callback](std::shared_ptr> p, std::vector inputs) { auto f = [p](DeviceTensorND dv) { p->set(TensorAttr{TensorLayout{dv.shape(), dv.dtype()}, dv.comp_node()}); }; - return output_callback(std::move(f), std::move(inputs), true); + return output_callback(std::move(f), std::move(inputs), p, true); }); } diff --git a/imperative/python/src/graph_rt.h b/imperative/python/src/graph_rt.h index ee2a11da..a7ad80b1 100644 --- a/imperative/python/src/graph_rt.h +++ b/imperative/python/src/graph_rt.h @@ -25,7 +25,7 @@ class GraphNodePtr { T* m_node; public: GraphNodePtr(T* node) : - m_graph(node ? nullptr : node->owner_graph()->shared_from_this()), + m_graph(node ? 
node->owner_graph()->shared_from_this() : nullptr), m_node(node) {} T* operator->() {return m_node;} T& operator*() {return *m_node;} @@ -35,18 +35,36 @@ public: PYBIND11_DECLARE_HOLDER_TYPE(T, GraphNodePtr, true); +class RendezvousBase { +public: + virtual ~RendezvousBase() = default; + virtual void set_exception(std::exception_ptr p) = 0; +}; + template -class Rendezvous { +class Rendezvous: public RendezvousBase { std::mutex m_lock; int m_read_ahead = 0; bool m_drop_next = false; std::promise m_promise; -public: Rendezvous() = default; + struct Factory { + template + static auto make_rendezvous(Args&& ...args) { + auto ptr = new Rendezvous{std::forward(args)...}; + return std::shared_ptr>(ptr); + } + }; +public: Rendezvous(const Rendezvous& rhs) = delete; Rendezvous(Rendezvous&& rhs) = delete; Rendezvous& operator=(const Rendezvous& rhs) = delete; + template + static auto make(Args&& ...args) { + return Factory::make_rendezvous(std::forward(args)...); + } + R get() { std::future f; { @@ -96,6 +114,29 @@ public: m_read_ahead = 0; m_drop_next = false; } + + void set_exception(std::exception_ptr e) { + if (e) { + MGB_LOCK_GUARD(m_lock); + if (m_read_ahead >= 0) { + mgb_assert(m_read_ahead <= 1); + if (m_drop_next) { + m_drop_next = false; + } else { + m_promise.set_exception(e); + } + if (m_read_ahead == 1) { + m_promise = {}; + } + --m_read_ahead; + } else { + mgb_assert(m_read_ahead == -1); + // TODO: maybe exception should be ignored + // if value was already set ? + m_promise.set_exception(e); + } + } + } }; void init_graph_rt(pybind11::module m); diff --git a/imperative/python/src/utils.cpp b/imperative/python/src/utils.cpp index ed851af2..3169f272 100644 --- a/imperative/python/src/utils.cpp +++ b/imperative/python/src/utils.cpp @@ -204,17 +204,27 @@ void init_utils(py::module m) { py::class_(m, "ProfileEntry") .def_readwrite("op", &ProfileEntry::op) .def_readwrite("host", &ProfileEntry::host) - .def_readwrite("device_list", &ProfileEntry::device_list); + .def_readwrite("device_list", &ProfileEntry::device_list) + .def_readwrite("inputs", &ProfileEntry::inputs) + .def_readwrite("outputs", &ProfileEntry::outputs) + .def_readwrite("id", &ProfileEntry::id) + .def_readwrite("parent", &ProfileEntry::parent) + .def_readwrite("memory", &ProfileEntry::memory) + .def_readwrite("computation", &ProfileEntry::computation) + .def_property_readonly("param", [](ProfileEntry& self)->std::string{ + if(self.param){ + return self.param->to_string(); + } else { + return {}; + } + }); py::class_(m, "ProfilerImpl") .def(py::init<>()) - .def("start", - [](mgb::imperative::Profiler& profiler) { profiler.start(); }) - .def("stop", - [](mgb::imperative::Profiler& profiler) { profiler.stop(); }) - .def("dump", [](mgb::imperative::Profiler& profiler) { - return profiler.get_profile(); - }); + .def("start", &mgb::imperative::Profiler::start) + .def("stop", &mgb::imperative::Profiler::stop) + .def("clear", &mgb::imperative::Profiler::clear) + .def("dump", &mgb::imperative::Profiler::get_profile); using mgb::imperative::TensorSanityCheck; py::class_(m, "TensorSanityCheckImpl") diff --git a/imperative/python/test/conftest.py b/imperative/python/test/conftest.py new file mode 100644 index 00000000..f0de9679 --- /dev/null +++ b/imperative/python/test/conftest.py @@ -0,0 +1,4 @@ +import os +import sys + +sys.path.append(os.path.join(os.path.dirname(__file__), "helpers")) diff --git a/imperative/python/test/helpers/utils.py b/imperative/python/test/helpers/utils.py new file mode 100644 index 00000000..4724fd26 --- 
/dev/null +++ b/imperative/python/test/helpers/utils.py @@ -0,0 +1,67 @@ +import numpy as np + +from megengine import tensor + + +def _default_compare_fn(x, y): + np.testing.assert_allclose(x.numpy(), y, rtol=1e-6) + + +def opr_test(cases, func, compare_fn=_default_compare_fn, ref_fn=None, **kwargs): + """ + :param cases: a list of dicts, one per case; give two cases for a dynamic shape test. + Each dict should have an "input" entry, + and an "output" entry if ref_fn is None; + use lists for multiple inputs and outputs in each case. + :param func: the function that runs the opr. + :param compare_fn: the function used to compare each result with its expected value; defaults to + a thin wrapper around ``np.testing.assert_allclose``. + :param ref_fn: the function that generates the expected data; each case should supply "output" if it is None. + + Examples: + + .. code-block:: + + dtype = np.float32 + cases = [{"input": [10, 20]}, {"input": [20, 30]}] + opr_test(cases, + F.eye, + ref_fn=lambda n, m: np.eye(n, m).astype(dtype), + dtype=dtype) + + """ + + def check_results(results, expected): + if not isinstance(results, (tuple, list)): + results = (results,) + for r, e in zip(results, expected): + compare_fn(r, e) + + def get_param(cases, idx): + case = cases[idx] + inp = case.get("input", None) + outp = case.get("output", None) + if inp is None: + raise ValueError("the test case should have input") + if not isinstance(inp, (tuple, list)): + inp = (inp,) + if ref_fn is not None and callable(ref_fn): + outp = ref_fn(*inp) + if outp is None: + raise ValueError("the test case should have output or reference function") + if not isinstance(outp, (tuple, list)): + outp = (outp,) + + return inp, outp + + if len(cases) == 0: + raise ValueError("should give at least one case") + + if not callable(func): + raise ValueError("the input func should be callable") + + inp, outp = get_param(cases, 0) + inp_tensor = [tensor(inpi) for inpi in inp] + + results = func(*inp_tensor, **kwargs) + check_results(results, outp) diff --git a/imperative/python/test/integration/test_converge.py b/imperative/python/test/integration/test_converge.py index 1beded21..d16570c7 100644 --- a/imperative/python/test/integration/test_converge.py +++ b/imperative/python/test/integration/test_converge.py @@ -80,7 +80,7 @@ def test_training_converge(): def train(data, label): with gm: pred = net(data) - loss = F.cross_entropy_with_softmax(pred, label) + loss = F.nn.cross_entropy(pred, label) gm.backward(loss) return loss diff --git a/imperative/python/test/integration/test_correctness.py b/imperative/python/test/integration/test_correctness.py index db77e374..d33bde89 100644 --- a/imperative/python/test/integration/test_correctness.py +++ b/imperative/python/test/integration/test_correctness.py @@ -24,7 +24,6 @@ from megengine.jit import SublinearMemoryConfig from megengine.module import AvgPool2d, BatchNorm2d, Conv2d, Linear, Module from megengine.optimizer import SGD from megengine.tensor import Tensor -from megengine.test import assertTensorClose def get_gpu_name(): @@ -93,7 +92,7 @@ class MnistNet(Module): def train(data, label, net, opt, gm): with gm: pred = net(data) - loss = F.cross_entropy_with_softmax(pred, label) + loss = F.nn.cross_entropy(pred, label) gm.backward(loss) return loss @@ -172,13 +171,13 @@ def run_train( loss = train_func(data, label, net, opt, gm) opt.step() - assertTensorClose(loss.numpy(), checkpoint["loss"], max_err=max_err) + np.testing.assert_allclose(loss.numpy(), checkpoint["loss"], atol=max_err) for param, param_ref in zip( net.state_dict().items(), 
checkpoint["net_updated"].items() ): assert param[0] == param_ref[0] - assertTensorClose(param[1], param_ref[1], max_err=max_err) + np.testing.assert_allclose(param[1], param_ref[1], atol=max_err) def run_eval( @@ -209,7 +208,7 @@ def run_eval( for _ in range(3): new_value = eval_fun(data, net=net) - assertTensorClose(new_value.numpy(), refer_value.numpy(), max_err=max_err) + np.testing.assert_allclose(new_value.numpy(), refer_value.numpy(), atol=max_err) def test_correctness(): diff --git a/imperative/python/test/integration/test_dp_correctness.py b/imperative/python/test/integration/test_dp_correctness.py index e35a4e8f..3491cf5f 100644 --- a/imperative/python/test/integration/test_dp_correctness.py +++ b/imperative/python/test/integration/test_dp_correctness.py @@ -27,7 +27,6 @@ from megengine.functional.debug_param import set_conv_execution_strategy from megengine.module import AvgPool2d, BatchNorm2d, Conv2d, Linear, Module from megengine.optimizer import SGD from megengine.tensor import Tensor -from megengine.test import assertTensorClose p_num = 4 @@ -99,7 +98,7 @@ def train(data, label, net, opt, gm): opt.clear_grad() with gm: pred = net(data) - loss = F.cross_entropy_with_softmax(pred, label) + loss = F.nn.cross_entropy(pred, label) gm.backward(loss) opt.step() return loss @@ -181,7 +180,7 @@ def run_test( loss = train(data_train, label_train, net, opt, gm) - assertTensorClose(loss.numpy(), checkpoint["loss"], max_err=max_err) + np.testing.assert_allclose(loss.numpy(), checkpoint["loss"], atol=max_err) if dist.get_rank(): return @@ -189,7 +188,7 @@ def run_test( net.state_dict().items(), checkpoint["net_updated"].items() ): assert param[0] == param_ref[0] - assertTensorClose(param[1], param_ref[1], max_err=max_err) + np.testing.assert_allclose(param[1], param_ref[1], atol=max_err) procs = [] for rank in range(p_num): diff --git a/imperative/python/test/integration/test_trace_dump.py b/imperative/python/test/integration/test_trace_dump.py index f6e27398..149148a4 100644 --- a/imperative/python/test/integration/test_trace_dump.py +++ b/imperative/python/test/integration/test_trace_dump.py @@ -47,13 +47,17 @@ class XORNet(M.Module): self.num_class = 2 super().__init__() self.fc0 = M.Linear(self.num_class, self.mid_dim, bias=True) + self.bn0 = M.BatchNorm1d(self.mid_dim) self.fc1 = M.Linear(self.mid_dim, self.mid_dim, bias=True) + self.bn1 = M.BatchNorm1d(self.mid_dim) self.fc2 = M.Linear(self.mid_dim, self.num_class, bias=True) def forward(self, x): x = self.fc0(x) + x = self.bn0(x) x = F.tanh(x) x = self.fc1(x) + x = self.bn1(x) x = F.tanh(x) x = self.fc2(x) return x @@ -72,7 +76,7 @@ def test_xornet_trace_dump(): with gm: net.train() pred = net(data) - loss = F.cross_entropy_with_softmax(pred, label) + loss = F.nn.cross_entropy(pred, label) gm.backward(loss) return pred, loss @@ -80,7 +84,7 @@ def test_xornet_trace_dump(): def val_fun(data, label): net.eval() pred = net(data) - loss = F.cross_entropy_with_softmax(pred, label) + loss = F.nn.cross_entropy(pred, label) return pred, loss @trace(symbolic=True, capture_as_const=True) diff --git a/imperative/python/test/run.sh b/imperative/python/test/run.sh index 0f1f0fd7..825c1017 100755 --- a/imperative/python/test/run.sh +++ b/imperative/python/test/run.sh @@ -1,6 +1,7 @@ #!/bin/bash -e -test_dirs="test" +test_dirs="test megengine" + TEST_PLAT=$1 if [[ "$TEST_PLAT" == cpu ]]; then diff --git a/imperative/python/test/unit/core/test_autodiff.py b/imperative/python/test/unit/core/test_autodiff.py index 1cc7d453..a910da26 100644 --- 
a/imperative/python/test/unit/core/test_autodiff.py +++ b/imperative/python/test/unit/core/test_autodiff.py @@ -306,7 +306,7 @@ def test_AxisAddRemove(): x = TensorWrapper(x_np) grad = Grad().wrt(x, callback=save_to(x)) - y = F.remove_axis(F.add_axis(x, 2), 0) + y = F.squeeze(F.expand_dims(x, 2), 0) grad(y, F.ones_like(y)) np.testing.assert_equal( @@ -319,7 +319,7 @@ def test_Broadcast(): x = TensorWrapper(x_np) grad = Grad().wrt(x, callback=save_to(x)) - y = F.broadcast(x, (3, 3, 10)) + y = F.broadcast_to(x, (3, 3, 10)) grad(y, F.ones_like(y)) np.testing.assert_equal(np.ones((3, 3, 1), dtype=np.float32) * 10, x.grad.numpy()) diff --git a/imperative/python/test/unit/core/test_dtype_bfloat16.py b/imperative/python/test/unit/core/test_dtype_bfloat16.py new file mode 100644 index 00000000..768ccee2 --- /dev/null +++ b/imperative/python/test/unit/core/test_dtype_bfloat16.py @@ -0,0 +1,63 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import pickle + +import numpy as np + +from megengine.core.tensor.dtype import bfloat16 +from megengine.core.tensor.raw_tensor import as_raw_tensor + + +def test_define(): + np.testing.assert_allclose( + np.array([0.5, 0.13425, 3.4687, -1.34976, -9.34673, 0.0], dtype=bfloat16), + np.array([0.5, 0.133789, 3.46875, -1.351562, -9.375, 0.0], dtype=np.float32), + atol=1e-6, + ) + + +def test_cast(): + dtypes = [np.int8, np.int16, np.int32, np.float32, np.float64] + fp32_values = [0.34985, 10.943, -0.5, -19.3, 21.49673] + bf16_values = [0.349609, 10.9375, -0.5, -19.25, 21.5] + int_values = [34, 10, -5, -19, 21] + for dtype in dtypes: + np.testing.assert_allclose( + np.array(fp32_values, dtype=bfloat16).astype(dtype), + np.array(bf16_values, dtype=dtype), + atol=1e-6, + ) + np.testing.assert_allclose( + np.array(int_values, dtype=dtype), + np.array(int_values, dtype=bfloat16).astype(dtype), + atol=1e-6, + ) + + +def test_shared_nd(): + data = np.array([-3.4, 1.394683, 2.323497, -7.439948, -5.2397], dtype=bfloat16) + snd = as_raw_tensor(data, dtype=bfloat16, device="xpux") + assert snd.numpy().dtype == bfloat16 + np.testing.assert_allclose( + snd.numpy(), [-3.40625, 1.398438, 2.328125, -7.4375, -5.25], atol=1e-6 + ) + + data = np.array([-9.34964, -8.342, 9.4385, 0.18746, 1.48], dtype=bfloat16) + snd = as_raw_tensor(data, dtype=bfloat16, device="xpux") + np.testing.assert_allclose( + snd.numpy(), [-9.375, -8.3125, 9.4375, 0.1875, 1.476562], atol=1e-6 + ) + + +def test_pickle(): + x = np.ascontiguousarray(np.random.rand(8192), dtype=bfloat16) + pkl = pickle.dumps(x, pickle.HIGHEST_PROTOCOL) + y = pickle.loads(pkl) + assert x.dtype is y.dtype + np.testing.assert_allclose(x.astype(np.float32), y.astype(np.float32), atol=1e-6) diff --git a/imperative/python/test/unit/core/test_dtype_intbx.py b/imperative/python/test/unit/core/test_dtype_intbx.py new file mode 100644 index 00000000..08cbd355 --- /dev/null +++ b/imperative/python/test/unit/core/test_dtype_intbx.py @@ -0,0 +1,100 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import pickle + +import numpy as np +import pytest + +from megengine.core.tensor.dtype import intb1, intb2, intb4 +from megengine.core.tensor.raw_tensor import as_raw_tensor + + +def bit_define_test(bit, low_bit_type): + max_value = (1 << bit) - 1 + min_value = 1 - (1 << bit) + + a = np.array([i for i in range(min_value, max_value + 2, 2)], dtype=low_bit_type) + + for i in range(max_value + 1): + np.testing.assert_equal(a[i], i * 2 - max_value) + np.testing.assert_equal(str(a[i]), str(i * 2 - max_value)) + + with pytest.raises(ValueError): + np.arange(min_value, max_value, dtype=low_bit_type) + + with pytest.raises(ValueError): + np.arange(min_value - 2, max_value + 4, 2, dtype=low_bit_type) + + np.testing.assert_allclose( + np.arange(min_value, 12, 2, dtype=low_bit_type), + (np.arange((13 - min_value) // 2, dtype=np.int8) % (max_value + 1)) * 2 + - max_value, + ) + + np.testing.assert_allclose( + np.arange(max_value, max_value - 20, -2, dtype=low_bit_type), + (np.arange(max_value, max_value - 10, -1, dtype=np.int8) % (max_value + 1)) * 2 + - max_value, + ) + + +def test_define(): + bit_define_test(1, intb1) + bit_define_test(2, intb2) + bit_define_test(4, intb4) + + +def _bit_cast_test(bit, low_bit_type): + dtypes = [np.int8, np.int16, np.int32, np.float32, np.float64] + + max_value = (1 << bit) - 1 + min_value = 1 - (1 << bit) + for dtype in dtypes: + np.testing.assert_allclose( + np.arange(min_value, max_value + 2, 2, dtype=low_bit_type).astype(dtype), + np.arange(min_value, max_value + 2, 2, dtype=dtype), + ) + + with pytest.raises(ValueError): + np.array([2, 1, -1], dtype=int).astype(low_bit_type) + with pytest.raises(ValueError): + np.array([min_value - 2, 1, max_value + 2], dtype=int).astype(low_bit_type) + + +def test_cast(): + _bit_cast_test(1, intb1) + _bit_cast_test(2, intb2) + _bit_cast_test(4, intb4) + + +def _shared_nd_test(bit, low_bit_type): + max_value = (1 << bit) - 1 + min_value = 1 - (1 << bit) + + data = np.arange(min_value, max_value + 2, 2, dtype=low_bit_type) + snd = as_raw_tensor(data, dtype=low_bit_type, device="xpux") + np.testing.assert_allclose(snd.numpy(), range(min_value, max_value + 2, 2)) + + data = np.arange(min_value, max_value + 2, 4, dtype=low_bit_type) + snd = as_raw_tensor(data, dtype=low_bit_type, device="xpux") + np.testing.assert_allclose(snd.numpy(), range(min_value, max_value + 2, 4)) + + +def test_shared_nd(): + _shared_nd_test(1, intb1) + _shared_nd_test(2, intb2) + _shared_nd_test(4, intb4) + + +def test_pickle(): + x = np.ascontiguousarray(np.random.randint(2, size=8192) * 2 - 1, dtype=intb1) + pkl = pickle.dumps(x, pickle.HIGHEST_PROTOCOL) + y = pickle.loads(pkl) + assert x.dtype is y.dtype + np.testing.assert_allclose(x.astype(np.float32), y.astype(np.float32)) diff --git a/imperative/python/test/unit/core/test_dtype_quant.py b/imperative/python/test/unit/core/test_dtype_quant.py new file mode 100644 index 00000000..e06cb297 --- /dev/null +++ b/imperative/python/test/unit/core/test_dtype_quant.py @@ -0,0 +1,214 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from functools import partial + +import numpy as np +import pytest + +import megengine.core.tensor.megbrain_graph as G +from megengine.core.ops import builtin as ops +from megengine.core.tensor.core import apply +from megengine.core.tensor.dtype import ( + _metadata_dict, + convert_from_qint4, + convert_from_qint8, + convert_from_quint4, + convert_from_quint8, + convert_to_qint4, + convert_to_qint8, + convert_to_quint4, + convert_to_quint8, + get_scale, + get_zero_point, + is_quantize, + qint4, + qint8, + quint4, + quint8, +) +from megengine.core.tensor.raw_tensor import as_raw_tensor +from megengine.distributed.helper import get_device_count_by_fork + + +def test_dtype_quint8(): + with pytest.raises(ValueError): + blah = quint8(0.05, 0.233) + with pytest.raises(ValueError): + blah = quint8(0.02, 777) + with pytest.raises(ValueError): + blah = quint8(0.02, -1) + dt = quint8(0.01, 135) + assert isinstance(dt, np.dtype) + assert "mgb_dtype" in dt.metadata + np.testing.assert_allclose(dt.metadata["mgb_dtype"]["scale"], 0.01) + np.testing.assert_equal(dt.metadata["mgb_dtype"]["zero_point"], 135) + + assert is_quantize(dt) + np.testing.assert_allclose(get_scale(dt), 0.01) + np.testing.assert_equal(get_zero_point(dt), 135) + + +def test_dtype_qint8(): + dt = qint8(0.01) + assert isinstance(dt, np.dtype) + assert "mgb_dtype" in dt.metadata + np.testing.assert_allclose(dt.metadata["mgb_dtype"]["scale"], 0.01) + + assert is_quantize(dt) + np.testing.assert_allclose(get_scale(dt), 0.01) + + +def _get_compiled_result(inp, dtype, shape, device, calc_func=None): + graph = G.Graph() + # graph.options.async_exec_level = 0b100 + inp_node = G.InputNode(device=device, dtype=dtype, shape=shape, graph=graph) + temp_rst = calc_func(inp_node.outputs[0]) + oup_node = G.OutputNode(temp_rst) + func = graph.compile(oup_node.outputs[0]) + inp_node.set_value(as_raw_tensor(inp, dtype=dtype, device=device)._dev_tensor()) + func.execute() + return oup_node.get_value().numpy() + + +def _check_result_attr(oup, dtype, dtype_str, is_unsigned=True): + metadata = _metadata_dict[dtype_str] + assert "mgb_dtype" in oup.dtype.metadata + assert is_quantize(oup.dtype) + np.testing.assert_equal(oup.dtype.metadata["mgb_dtype"]["name"], metadata.name) + np.testing.assert_allclose(get_scale(oup.dtype), get_scale(dtype)) + if is_unsigned: + np.testing.assert_equal(get_zero_point(oup.dtype), get_zero_point(dtype)) + + +def test_dtype_int8_ffi_handle(): + device = "xpux" + shape = (3, 3, 3) + data = np.random.random(shape).astype(np.float32) * 5 - 1 + + def identity(x): + return x + + dtype = quint8(0.01, 127) + inp = convert_to_quint8(data, dtype) + oup = _get_compiled_result(inp, dtype, shape, device, calc_func=identity) + _check_result_attr(oup, dtype, "quint8") + np.testing.assert_allclose(convert_from_quint8(oup), convert_from_quint8(inp)) + + dtype = qint8(0.01) + inp = convert_to_qint8(data, dtype) + oup = _get_compiled_result(inp, dtype, shape, device, calc_func=identity) + _check_result_attr(oup, dtype, "qint8", is_unsigned=False) + np.testing.assert_allclose(convert_from_qint8(oup), convert_from_qint8(inp)) + + +def test_quint8_typecvt(): + device = "xpux" + shape = (3, 3, 3) + data = np.random.random(shape).astype(np.float32) * 5 - 1 + + def typecvt(x, dt=None): + (y,) = apply(ops.TypeCvt(param=dt), x) 
+ return y + + # convert to quint8 + dtype = quint8(0.01, 135) + oup = _get_compiled_result( + data, np.float32, shape, device, calc_func=partial(typecvt, dt=dtype) + ) + _check_result_attr(oup, dtype, "quint8") + np.testing.assert_equal(oup, convert_to_quint8(data, dtype)) + + # convert from quint8 to float32 + oup_float = _get_compiled_result( + oup, dtype, shape, device, calc_func=partial(typecvt, dt=np.float32) + ) + assert oup_float.dtype == np.float32 + np.testing.assert_equal( + oup_float, convert_from_quint8(convert_to_quint8(data, dtype)) + ) + + +def test_dtype_quint4(): + with pytest.raises(ValueError): + blah = quint4(0.05, 0.233) + with pytest.raises(ValueError): + blah = quint4(0.02, 18) + with pytest.raises(ValueError): + blah = quint4(0.02, -1) + dt = quint4(0.01, 8) + assert isinstance(dt, np.dtype) + assert "mgb_dtype" in dt.metadata + np.testing.assert_allclose(dt.metadata["mgb_dtype"]["scale"], 0.01) + np.testing.assert_equal(dt.metadata["mgb_dtype"]["zero_point"], 8) + + assert is_quantize(dt) + np.testing.assert_allclose(get_scale(dt), 0.01) + np.testing.assert_equal(get_zero_point(dt), 8) + + +def test_dtype_qint4(): + dt = qint4(0.01) + assert isinstance(dt, np.dtype) + assert "mgb_dtype" in dt.metadata + np.testing.assert_allclose(dt.metadata["mgb_dtype"]["scale"], 0.01) + + assert is_quantize(dt) + np.testing.assert_allclose(get_scale(dt), 0.01) + + +def test_dtype_int4_ffi_handle(): + device = "xpux" + shape = (3, 3, 3) + data = np.random.random(shape).astype(np.float32) * 5 - 1 + + def identity(x): + return x + + dtype = quint4(0.01, 7) + inp = convert_to_quint4(data, dtype) + oup = _get_compiled_result(inp, dtype, shape, device, calc_func=identity) + _check_result_attr(oup, dtype, "quint4") + np.testing.assert_allclose(convert_from_quint4(oup), convert_from_quint4(inp)) + + dtype = qint4(0.01) + inp = convert_to_qint4(data, dtype) + oup = _get_compiled_result(inp, dtype, shape, device, calc_func=identity) + _check_result_attr(oup, dtype, "qint4", is_unsigned=False) + np.testing.assert_allclose(convert_from_qint4(oup), convert_from_qint4(inp)) + + +@pytest.mark.skipif( + get_device_count_by_fork("gpu") != 0, + reason="TypeCvt to quint4 is not supported on GPU", +) +def test_quint4_typecvt(): + device = "xpux" + shape = (3, 3, 3) + data = np.random.random(shape).astype(np.float32) * 5 - 1 + + def typecvt(x, dt=None): + (y,) = apply(ops.TypeCvt(param=dt), x) + return y + + # convert to quint4 + dtype = quint4(0.01, 5) + oup = _get_compiled_result( + data, np.float32, shape, device, calc_func=partial(typecvt, dt=dtype) + ) + _check_result_attr(oup, dtype, "quint4") + np.testing.assert_equal(oup, convert_to_quint4(data, dtype)) + + # convert from quint4 to float32 + oup_float = _get_compiled_result( + oup, dtype, shape, device, calc_func=partial(typecvt, dt=np.float32) + ) + assert oup_float.dtype == np.float32 + np.testing.assert_equal( + oup_float, convert_from_quint4(convert_to_quint4(data, dtype)) + ) diff --git a/imperative/python/test/unit/core/test_indexing_op.py b/imperative/python/test/unit/core/test_indexing_op.py index e369ea08..80478d63 100644 --- a/imperative/python/test/unit/core/test_indexing_op.py +++ b/imperative/python/test/unit/core/test_indexing_op.py @@ -519,6 +519,18 @@ def test_advance_indexing_with_bool(): np.testing.assert_equal(a[b], aa[bb].numpy()) np.testing.assert_equal(a[:, [True, False]], aa[:, [True, False]].numpy()) + a = np.array([[True, False], [False, True]]) + b = np.array([1]) + aa = Tensor(a) + bb = Tensor(b) + 
np.testing.assert_equal(a[b], aa[bb].numpy()) + b = np.array([[True, True], [False, True]]) + bb = Tensor(b) + np.testing.assert_equal(a[b], aa[bb].numpy()) + a[b] = False + aa[bb] = False + np.testing.assert_equal(a, aa.numpy()) + # XXX: trace does not expect empty condtake tensor if not use_tensor_shape(): a = np.ones((2, 2), dtype=np.int32) diff --git a/imperative/python/test/unit/core/test_megbrain_graph.py b/imperative/python/test/unit/core/test_megbrain_graph.py index 3fb6a9de..e7997606 100644 --- a/imperative/python/test/unit/core/test_megbrain_graph.py +++ b/imperative/python/test/unit/core/test_megbrain_graph.py @@ -11,7 +11,6 @@ from concurrent.futures import Future import numpy as np import megengine.functional as F -from megengine.core._imperative_rt import DeviceTensorND from megengine.core.tensor import megbrain_graph as mgb_graph from megengine.core.tensor.raw_tensor import as_raw_tensor @@ -83,3 +82,22 @@ def test_op(): f() np.testing.assert_equal(x.numpy(), -y.result().numpy()) + + +def test_exception(): + err_msg = "QwQ" + + def throw_exc(): + raise RuntimeError(err_msg) + + g = mgb_graph.Graph() + x, _ = mgb_graph.input_callback(throw_exc, device="xpux", dtype="float32", graph=g) + y = mgb_graph.OutputNode(F.neg(x)) + f = g.compile(y.outputs[0]) + try: + f.execute() + y.get_value() + except Exception as exc: + assert err_msg in str(exc) + else: + raise AssertionError("expected graph execution to raise") diff --git a/imperative/python/test/unit/core/test_tensor_wrapper.py b/imperative/python/test/unit/core/test_tensor_wrapper.py index a90b109a..2f1a590d 100644 --- a/imperative/python/test/unit/core/test_tensor_wrapper.py +++ b/imperative/python/test/unit/core/test_tensor_wrapper.py @@ -35,11 +35,16 @@ def test_matmul(): def test_reduce(): - for m in ["sum", "prod", "min", "max", "mean"]: - x_np = np.random.rand(10).astype("float32") - x = TensorWrapper(x_np) - y = getattr(x, m)(axis=-1, keepdims=True) - np.testing.assert_almost_equal(y.numpy(), getattr(x_np, m)(-1), decimal=6) + def test_x(x_np): + for m in ["sum", "prod", "min", "max", "mean"]: + x = TensorWrapper(x_np) + y = getattr(x, m)(axis=-1, keepdims=True) + np.testing.assert_almost_equal(y.numpy(), getattr(x_np, m)(-1), decimal=6) + + test_x((10 * np.random.rand(10) + 1).astype("int32")) + test_x(np.random.rand(10).astype("float32")) + test_x(np.array([True, True, True])) + test_x(np.array([True, False, True])) def test_set_subtensor(): diff --git a/imperative/python/test/unit/data/__init__.py b/imperative/python/test/unit/data/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/imperative/python/test/unit/data/test_dataloader.py b/imperative/python/test/unit/data/test_dataloader.py new file mode 100644 index 00000000..6bb0f3e3 --- /dev/null +++ b/imperative/python/test/unit/data/test_dataloader.py @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
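A note on the `len(dataloader)` assertions in the new test file below: with 100 samples and batch size 6, the expected lengths follow from plain ceil/floor division depending on `drop_last`. A minimal sketch of that arithmetic (plain Python, independent of the MegEngine API):

.. code-block:: python

    import math

    sample_num, batch_size = 100, 6
    # drop_last=False keeps the final partial batch
    assert math.ceil(sample_num / batch_size) == 17
    # drop_last=True discards the final partial batch
    assert sample_num // batch_size == 16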
+import os +import time + +import numpy as np +import pytest + +from megengine.data.collator import Collator +from megengine.data.dataloader import DataLoader +from megengine.data.dataset import ArrayDataset +from megengine.data.sampler import RandomSampler, SequentialSampler +from megengine.data.transform import PseudoTransform, Transform + + +def init_dataset(): + sample_num = 100 + rand_data = np.random.randint(0, 255, size=(sample_num, 1, 32, 32), dtype=np.uint8) + label = np.random.randint(0, 10, size=(sample_num,), dtype=int) + dataset = ArrayDataset(rand_data, label) + return dataset + + +def test_dataloader_init(): + dataset = init_dataset() + with pytest.raises(ValueError): + dataloader = DataLoader(dataset, num_workers=2, divide=True) + with pytest.raises(ValueError): + dataloader = DataLoader(dataset, num_workers=-1) + with pytest.raises(ValueError): + dataloader = DataLoader(dataset, timeout=-1) + with pytest.raises(ValueError): + dataloader = DataLoader(dataset, num_workers=0, divide=True) + + dataloader = DataLoader(dataset) + assert isinstance(dataloader.sampler, SequentialSampler) + assert isinstance(dataloader.transform, PseudoTransform) + assert isinstance(dataloader.collator, Collator) + + dataloader = DataLoader( + dataset, sampler=RandomSampler(dataset, batch_size=6, drop_last=False) + ) + assert len(dataloader) == 17 + dataloader = DataLoader( + dataset, sampler=RandomSampler(dataset, batch_size=6, drop_last=True) + ) + assert len(dataloader) == 16 + + +def test_dataloader_serial(): + dataset = init_dataset() + dataloader = DataLoader( + dataset, sampler=RandomSampler(dataset, batch_size=4, drop_last=False) + ) + for (data, label) in dataloader: + assert data.shape == (4, 1, 32, 32) + assert label.shape == (4,) + + +def test_dataloader_parallel(): + # set max shared memory to 100M + os.environ["MGE_PLASMA_MEMORY"] = "100000000" + + dataset = init_dataset() + dataloader = DataLoader( + dataset, + sampler=RandomSampler(dataset, batch_size=4, drop_last=False), + num_workers=2, + divide=False, + ) + for (data, label) in dataloader: + assert data.shape == (4, 1, 32, 32) + assert label.shape == (4,) + + dataloader = DataLoader( + dataset, + sampler=RandomSampler(dataset, batch_size=4, drop_last=False), + num_workers=2, + divide=True, + ) + for (data, label) in dataloader: + assert data.shape == (4, 1, 32, 32) + assert label.shape == (4,) + + +def test_dataloader_parallel_timeout(): + dataset = init_dataset() + + class TimeoutTransform(Transform): + def __init__(self): + pass + + def apply(self, input): + time.sleep(10) + return input + + dataloader = DataLoader( + dataset, + sampler=RandomSampler(dataset, batch_size=4, drop_last=False), + transform=TimeoutTransform(), + num_workers=2, + timeout=2, + ) + with pytest.raises(RuntimeError, match=r".*timeout.*"): + data_iter = iter(dataloader) + batch_data = next(data_iter) + + +def test_dataloader_parallel_worker_exception(): + dataset = init_dataset() + + class FakeErrorTransform(Transform): + def __init__(self): + pass + + def apply(self, input): + y = x + 1 + return input + + dataloader = DataLoader( + dataset, + sampler=RandomSampler(dataset, batch_size=4, drop_last=False), + transform=FakeErrorTransform(), + num_workers=2, + ) + with pytest.raises(RuntimeError, match=r"worker.*died"): + data_iter = iter(dataloader) + batch_data = next(data_iter) + + +def _multi_instances_parallel_dataloader_worker(): + dataset = init_dataset() + + for divide_flag in [True, False]: + train_dataloader = DataLoader( + dataset, + 
sampler=RandomSampler(dataset, batch_size=4, drop_last=False), + num_workers=2, + divide=divide_flag, + ) + val_dataloader = DataLoader( + dataset, + sampler=RandomSampler(dataset, batch_size=10, drop_last=False), + num_workers=2, + divide=divide_flag, + ) + for idx, (data, label) in enumerate(train_dataloader): + assert data.shape == (4, 1, 32, 32) + assert label.shape == (4,) + if idx % 5 == 0: + for val_data, val_label in val_dataloader: + assert val_data.shape == (10, 1, 32, 32) + assert val_label.shape == (10,) + + +def test_dataloader_parallel_multi_instances(): + # set max shared memory to 100M + os.environ["MGE_PLASMA_MEMORY"] = "100000000" + + _multi_instances_parallel_dataloader_worker() + + +def test_dataloader_parallel_multi_instances_multiprocessing(): + # set max shared memory to 100M + os.environ["MGE_PLASMA_MEMORY"] = "100000000" + + import multiprocessing as mp + + # mp.set_start_method("spawn") + processes = [] + for i in range(4): + p = mp.Process(target=_multi_instances_parallel_dataloader_worker) + p.start() + processes.append(p) + + for p in processes: + p.join() diff --git a/imperative/python/test/unit/distributed/test_distributed.py b/imperative/python/test/unit/distributed/test_distributed.py index f81b9f42..29eed7ef 100644 --- a/imperative/python/test/unit/distributed/test_distributed.py +++ b/imperative/python/test/unit/distributed/test_distributed.py @@ -10,12 +10,17 @@ import multiprocessing as mp import platform import queue +import numpy as np import pytest import megengine as mge import megengine.distributed as dist from megengine.core.ops.builtin import CollectiveComm, ParamPackConcat, ParamPackSplit -from megengine.distributed.helper import get_device_count_by_fork +from megengine.distributed.helper import ( + get_device_count_by_fork, + param_pack_concat, + param_pack_split, +) def _assert_q_empty(q): @@ -195,3 +200,19 @@ def test_oprmm_hashable(): rhs = (CollectiveComm(), ParamPackConcat(), ParamPackSplit()) assert lhs == rhs assert hash(lhs) == hash(rhs) + + +def test_param_pack_split(): + a = mge.Tensor(np.ones((10,), np.int32)) + b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)]) + assert np.allclose(b.numpy(), a.numpy()[1]) + assert np.allclose(c.numpy(), a.numpy()[1:].reshape(3, 3)) + + +def test_param_pack_concat(): + a = mge.Tensor(np.ones((1,), np.int32)) + b = mge.Tensor(np.ones((3, 3), np.int32)) + offsets_val = [0, 1, 1, 10] + offsets = mge.Tensor(offsets_val, np.int32) + c = param_pack_concat([a, b], offsets, offsets_val) + assert np.allclose(np.concatenate([a.numpy(), b.numpy().flatten()]), c.numpy()) diff --git a/imperative/python/test/unit/functional/__init__.py b/imperative/python/test/unit/functional/__init__.py deleted file mode 100644 index 1207b5d9..00000000 --- a/imperative/python/test/unit/functional/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# -*- coding: utf-8 -*- -# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") -# -# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
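The `test_param_pack_split` case above pins down the offset convention: `offsets` holds a (begin, end) pair per output tensor, indexing into the flattened buffer. A pure-NumPy sketch of that convention, assuming exactly this pairing; `param_pack_split_ref` is a hypothetical reference helper, not MegEngine API:

.. code-block:: python

    import numpy as np

    def param_pack_split_ref(flat, offsets, shapes):
        # offsets[2*i] and offsets[2*i+1] delimit the slice of the
        # flattened buffer backing the i-th output tensor
        outs = []
        for i, shape in enumerate(shapes):
            begin, end = offsets[2 * i], offsets[2 * i + 1]
            outs.append(flat[begin:end].reshape(shape))
        return outs

    flat = np.arange(10, dtype=np.int32)
    b, c = param_pack_split_ref(flat, [0, 1, 1, 10], [(1,), (3, 3)])
    assert b.shape == (1,) and c.shape == (3, 3)
    assert np.array_equal(c, flat[1:10].reshape(3, 3))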
diff --git a/imperative/python/test/unit/functional/test_elemwise.py b/imperative/python/test/unit/functional/test_elemwise.py index db7db5cc..30421dd8 100644 --- a/imperative/python/test/unit/functional/test_elemwise.py +++ b/imperative/python/test/unit/functional/test_elemwise.py @@ -10,34 +10,33 @@ import numpy as np import megengine.functional as F from megengine import tensor -from megengine.test import assertTensorClose def test_abs(): - assertTensorClose( + np.testing.assert_allclose( F.abs(tensor([-3.0, -4.0, -5.0])).numpy(), np.abs(np.array([-3.0, -4.0, -5.0], dtype=np.float32)), ) - assertTensorClose(F.abs(-3.0).numpy(), np.abs(np.float32(-3.0))) + np.testing.assert_allclose(F.abs(-3.0).numpy(), np.abs(np.float32(-3.0))) def test_multiply(): - assertTensorClose( + np.testing.assert_allclose( F.mul(-3.0, -4.0).numpy(), np.multiply(np.float32(-3.0), np.float32(-4.0)) ) - assertTensorClose( + np.testing.assert_allclose( F.mul(tensor([3.0, 4.0]), 4.0).numpy(), np.multiply(np.array([3.0, 4.0], dtype=np.float32), 4.0), ) - assertTensorClose( + np.testing.assert_allclose( F.mul(4.0, tensor([3.0, 4.0])).numpy(), np.multiply(4.0, np.array([3.0, 4.0], dtype=np.float32)), ) - assertTensorClose( + np.testing.assert_allclose( F.mul(tensor([3.0, 4.0]), tensor([3.0, 4.0])).numpy(), np.multiply( np.array([3.0, 4.0], dtype=np.float32), @@ -48,27 +47,31 @@ def test_multiply(): def test_clamp(): """Fix an issue when `lower` or `upper` is 0, it will be recognized as `False` and - `F.clamp` will fall into wrong conditions unexpectedly. + `F.clip` will fall into wrong conditions unexpectedly. """ x = np.linspace(-6, 6, dtype="float32") - assertTensorClose(F.clamp(tensor(x) + 3, 0, 6).numpy(), np.clip(x + 3, 0, 6)) - assertTensorClose(F.clamp(tensor(x) - 3, -6, 0).numpy(), np.clip(x - 3, -6, 0)) + np.testing.assert_allclose( + F.clip(tensor(x) + 3, 0, 6).numpy(), np.clip(x + 3, 0, 6) + ) + np.testing.assert_allclose( + F.clip(tensor(x) - 3, -6, 0).numpy(), np.clip(x - 3, -6, 0) + ) def test_isnan(): for case in [[1, float("nan"), 0]]: - assertTensorClose(F.isnan(tensor(case)).numpy(), np.isnan(case)) + np.testing.assert_allclose(F.isnan(tensor(case)).numpy(), np.isnan(case)) def test_isinf(): for case in [[1, float("inf"), 0]]: - assertTensorClose(F.isinf(tensor(case)).numpy(), np.isinf(case)) + np.testing.assert_allclose(F.isinf(tensor(case)).numpy(), np.isinf(case)) def test_sign(): for case in [[1, -1, 0]]: x = tensor(case) - assertTensorClose(F.sign(x).numpy(), np.sign(case).astype(x.dtype)) + np.testing.assert_allclose(F.sign(x).numpy(), np.sign(case).astype(x.dtype)) def test_cosh(): @@ -110,14 +113,6 @@ def test_atanh(): np.testing.assert_almost_equal(y_np, y_mge, decimal=5) -def test_fast_tanh(): - np.random.seed(42) - x = np.random.randn(100).astype("float32") - y_np = x * (27.0 + x * x) / (27.0 + 9.0 * x * x) - y_mge = F.fast_tanh(tensor(x)).numpy() - np.testing.assert_almost_equal(y_np, y_mge, decimal=6) - - def test_hswish(): np.random.seed(42) x = np.random.randn(100).astype("float32") diff --git a/imperative/python/test/unit/functional/test_functional.py b/imperative/python/test/unit/functional/test_functional.py index 0778fb15..ec99c5ca 100644 --- a/imperative/python/test/unit/functional/test_functional.py +++ b/imperative/python/test/unit/functional/test_functional.py @@ -7,9 +7,11 @@ # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
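The `test_clamp` docstring above describes a truthiness pitfall: a bound of 0 is falsy in Python, so a naive `if lower:` guard silently drops the bound. A minimal sketch of the wrong and the right guard on plain scalars (hypothetical helpers, not MegEngine's implementation of `F.clip`):

.. code-block:: python

    def clip_wrong(x, lower=None, upper=None):
        if lower:  # bug: a bound of 0 is treated as "no bound"
            x = max(x, lower)
        if upper:
            x = min(x, upper)
        return x

    def clip_right(x, lower=None, upper=None):
        if lower is not None:  # correct: only None means "no bound"
            x = max(x, lower)
        if upper is not None:
            x = min(x, upper)
        return x

    assert clip_wrong(-3, lower=0) == -3  # bound of 0 silently ignored
    assert clip_right(-3, lower=0) == 0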
import itertools +from functools import partial import numpy as np import pytest +from utils import opr_test import megengine.core.ops.builtin as builtin import megengine.core.tensor.dtype as dtype @@ -18,69 +20,6 @@ from megengine import Parameter, Tensor, is_cuda_available, tensor from megengine.core._trace_option import use_tensor_shape from megengine.core.autodiff.grad import Grad from megengine.core.tensor.utils import make_shape_tuple -from megengine.test import assertTensorClose - - -def _default_compare_fn(x, y): - assertTensorClose(x.numpy(), y) - - -def opr_test(cases, func, compare_fn=_default_compare_fn, ref_fn=None, **kwargs): - """ - func: the function to run opr. - compare_fn: the function to compare the result and expected, use assertTensorClose if None. - ref_fn: the function to generate expected data, should assign output if None. - cases: the list which have dict element, the list length should be 2 for dynamic shape test. - and the dict should have input, - and should have output if ref_fn is None. - should use list for multiple inputs and outputs for each case. - kwargs: The additional kwargs for opr func. - - simple examples: - - dtype = np.float32 - cases = [{"input": [10, 20]}, {"input": [20, 30]}] - opr_test(cases, - F.eye, - ref_fn=lambda n, m: np.eye(n, m).astype(dtype), - dtype=dtype) - - """ - - def check_results(results, expected): - if not isinstance(results, (tuple, list)): - results = (results,) - for r, e in zip(results, expected): - compare_fn(r, e) - - def get_param(cases, idx): - case = cases[idx] - inp = case.get("input", None) - outp = case.get("output", None) - if inp is None: - raise ValueError("the test case should have input") - if not isinstance(inp, (tuple, list)): - inp = (inp,) - if ref_fn is not None and callable(ref_fn): - outp = ref_fn(*inp) - if outp is None: - raise ValueError("the test case should have output or reference function") - if not isinstance(outp, (tuple, list)): - outp = (outp,) - - return inp, outp - - if len(cases) == 0: - raise ValueError("should give one case at least") - - if not callable(func): - raise ValueError("the input func should be callable") - - inp, outp = get_param(cases, 0) - inp_tensor = [tensor(inpi) for inpi in inp] - - results = func(*inp_tensor, **kwargs) - check_results(results, outp) def test_where(): @@ -163,43 +102,43 @@ def test_interpolate(): def linear_interpolate(): inp = tensor(np.arange(1, 3, dtype=np.float32).reshape(1, 1, 2)) - out = F.interpolate(inp, scale_factor=2.0, mode="LINEAR") - out2 = F.interpolate(inp, 4, mode="LINEAR") + out = F.nn.interpolate(inp, scale_factor=2.0, mode="LINEAR") + out2 = F.nn.interpolate(inp, 4, mode="LINEAR") - assertTensorClose( + np.testing.assert_allclose( out.numpy(), np.array([[[1.0, 1.25, 1.75, 2.0]]], dtype=np.float32) ) - assertTensorClose( + np.testing.assert_allclose( out2.numpy(), np.array([[[1.0, 1.25, 1.75, 2.0]]], dtype=np.float32) ) def many_batch_interpolate(): inp = tensor(np.arange(1, 9, dtype=np.float32).reshape(2, 1, 2, 2)) - out = F.interpolate(inp, [4, 4]) - out2 = F.interpolate(inp, scale_factor=2.0) + out = F.nn.interpolate(inp, [4, 4]) + out2 = F.nn.interpolate(inp, scale_factor=2.0) - assertTensorClose(out.numpy(), out2.numpy()) + np.testing.assert_allclose(out.numpy(), out2.numpy()) def assign_corner_interpolate(): inp = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) - out = F.interpolate(inp, [4, 4], align_corners=True) - out2 = F.interpolate(inp, scale_factor=2.0, align_corners=True) + out = F.nn.interpolate(inp, [4, 
4], align_corners=True) + out2 = F.nn.interpolate(inp, scale_factor=2.0, align_corners=True) - assertTensorClose(out.numpy(), out2.numpy()) + np.testing.assert_allclose(out.numpy(), out2.numpy()) def error_shape_linear_interpolate(): inp = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) with pytest.raises(ValueError): - F.interpolate(inp, scale_factor=2.0, mode="LINEAR") + F.nn.interpolate(inp, scale_factor=2.0, mode="LINEAR") def inappropriate_scale_linear_interpolate(): inp = tensor(np.arange(1, 3, dtype=np.float32).reshape(1, 1, 2)) with pytest.raises(ValueError): - F.interpolate(inp, scale_factor=[2.0, 3.0], mode="LINEAR") + F.nn.interpolate(inp, scale_factor=[2.0, 3.0], mode="LINEAR") linear_interpolate() many_batch_interpolate() @@ -232,7 +171,7 @@ def test_roi_align(): grad = Grad().wrt(inp_feat, callback=_save_to(inp_feat)) output_shape = (7, 7) - out_feat = F.roi_align( + out_feat = F.nn.roi_align( inp_feat, rois, output_shape=output_shape, @@ -255,7 +194,7 @@ def test_roi_pooling(): inp_feat, rois = _gen_roi_inp() grad = Grad().wrt(inp_feat, callback=_save_to(inp_feat)) output_shape = (7, 7) - out_feat = F.roi_pooling( + out_feat = F.nn.roi_pooling( inp_feat, rois, output_shape=output_shape, mode="max", scale=1.0 / 4, ) assert make_shape_tuple(out_feat.shape) == ( @@ -268,12 +207,72 @@ def test_roi_pooling(): assert make_shape_tuple(inp_feat.grad.shape) == make_shape_tuple(inp_feat.shape) +def test_adaptive_avg_pool2d(): + inp = tensor(np.arange(0, 16, dtype=np.float32).reshape(1, 1, 4, 4)) + oshp = (2, 2) + grad = Grad().wrt(inp, callback=_save_to(inp)) + outp = F.adaptive_avg_pool2d(inp, oshp,) + assert make_shape_tuple(outp.shape) == (inp.shape[0], inp.shape[1], *oshp,) + np.testing.assert_equal( + outp.numpy(), np.array([[[[2.5, 4.5], [10.5, 12.5]]]], dtype=np.float32) + ) + + grad(outp, tensor(F.ones_like(outp))) + assert make_shape_tuple(inp.grad.shape) == make_shape_tuple(inp.shape) + np.testing.assert_equal( + inp.grad.numpy(), + np.array( + [ + [ + [ + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + ] + ] + ], + dtype=np.float32, + ), + ) + + +def test_adaptive_max_pool2d(): + inp = tensor(np.arange(0, 16, dtype=np.float32).reshape(1, 1, 4, 4)) + oshp = (2, 2) + grad = Grad().wrt(inp, callback=_save_to(inp)) + outp = F.adaptive_max_pool2d(inp, oshp,) + assert make_shape_tuple(outp.shape) == (inp.shape[0], inp.shape[1], *oshp,) + np.testing.assert_equal( + outp.numpy(), np.array([[[[5, 7], [13, 15]]]], dtype=np.float32) + ) + + grad(outp, tensor(F.ones_like(outp))) + assert make_shape_tuple(inp.grad.shape) == make_shape_tuple(inp.shape) + np.testing.assert_equal( + inp.grad.numpy(), + np.array( + [ + [ + [ + [0.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 1.0], + [0.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 1.0], + ] + ] + ], + dtype=np.float32, + ), + ) + + def test_one_hot(): def onehot_low_dimension(): inp = tensor(np.arange(1, 4, dtype=np.int32)) out = F.one_hot(inp, num_classes=4) - assertTensorClose( + np.testing.assert_allclose( out.numpy(), np.eye(4, dtype=np.int32)[np.arange(1, 4, dtype=np.int32)] ) @@ -286,47 +285,12 @@ def test_one_hot(): inp = tensor(arr) out = F.one_hot(inp, 10) - assertTensorClose(out.numpy(), np.eye(10, dtype=np.int32)[arr]) + np.testing.assert_allclose(out.numpy(), np.eye(10, dtype=np.int32)[arr]) onehot_low_dimension() onehot_high_dimension() -def test_add_update(): - shape = (2, 3) - v = np.random.random(shape).astype(np.float32) - b = Tensor(v) - - u = F.add_update(b, 1) - 
assertTensorClose(u.numpy(), v + 1) - u = F.add_update(b, 1) - assertTensorClose(u.numpy(), v + 2) - - x = np.ones((2, 2), dtype=np.float32) - y = x * 0.5 - dest = tensor(x) - delta = tensor(y) - r = F.add_update(dest, delta, alpha=0.9, beta=0.1, bias=0.1) - assertTensorClose(r.numpy(), x * 0.9 + y * 0.1 + 0.1) - - -def test_add_update_params(): - b = np.random.random((2, 3)).astype(np.float32) - y = Tensor(b) - - # @jit.trace - def f(x): - return F.add_update(y, x) - - f(np.zeros((2, 3)).astype(np.float32)) - - z = Tensor(np.zeros((2, 3)).astype(np.float32)) - F.add_update(y, z, beta=0.1) - - res = f(np.ones((2, 3)).astype(np.float32)) - assertTensorClose(res.numpy(), b + 1) - - def test_binary_cross_entropy(): data1_shape = (2, 2) label1_shape = (2, 2) @@ -337,15 +301,15 @@ def test_binary_cross_entropy(): return 1 / (1 + np.exp(-x)) def compare_fn(x, y): - assertTensorClose(x.numpy(), y, max_err=5e-4) + np.testing.assert_allclose(x.numpy(), y, atol=5e-4) np.random.seed(123) - data1 = sigmoid(np.random.uniform(size=data1_shape).astype(np.float32)) + data1 = np.random.uniform(size=data1_shape).astype(np.float32) label1 = np.random.uniform(size=label1_shape).astype(np.float32) expect1 = np.array([0.6361], dtype=np.float32) np.random.seed(123) - data2 = sigmoid(np.random.uniform(size=data2_shape).astype(np.float32)) + data2 = np.random.uniform(size=data2_shape).astype(np.float32) label2 = np.random.uniform(size=label2_shape).astype(np.float32) expect2 = np.array([0.6750], dtype=np.float32) @@ -353,7 +317,17 @@ def test_binary_cross_entropy(): {"input": [data1, label1], "output": expect1,}, {"input": [data2, label2], "output": expect2,}, ] - opr_test(cases, F.binary_cross_entropy, compare_fn=compare_fn) + opr_test(cases, F.nn.binary_cross_entropy, compare_fn=compare_fn) + + cases = [ + {"input": [sigmoid(data1), label1], "output": expect1,}, + {"input": [sigmoid(data2), label2], "output": expect2,}, + ] + opr_test( + cases, + partial(F.nn.binary_cross_entropy, with_logits=False), + compare_fn=compare_fn, + ) def test_hinge_loss(): @@ -366,7 +340,7 @@ def test_hinge_loss(): expect = np.clip(0, np.inf, 1 - data * label).sum(axis=1).mean() cases.append({"input": [data, label], "output": expect}) - opr_test(cases, F.hinge_loss) + opr_test(cases, F.nn.hinge_loss) # cases with L2 norm cases = [] @@ -377,7 +351,7 @@ def test_hinge_loss(): cases.append({"input": [data, label], "output": expect}) def hinge_loss_with_l2_norm(pred, label): - return F.hinge_loss(pred, label, "L2") + return F.nn.hinge_loss(pred, label, "L2") opr_test(cases, hinge_loss_with_l2_norm) @@ -394,29 +368,10 @@ def test_nms(): ) inp = tensor(x) scores = tensor([0.5, 0.8, 0.9, 0.6], dtype=np.float32) - result = F.nms(inp, scores=scores, iou_thresh=0.5) + result = F.nn.nms(inp, scores=scores, iou_thresh=0.5) np.testing.assert_equal(result.numpy(), np.array([2, 1, 3], dtype=np.int32)) -def test_batched_nms(): - x = np.array( - [ - [0, 0, 100, 100], - [0.5, 0.5, 1.5, 1.5], - [20, 20, 100, 100], - [0.5, 0.5, 1.0, 1.0], - [10, 10, 100, 100], - [0.5, 0.5, 1.0, 1.0], - ], - dtype=np.float32, - ) - inp = tensor(x) - scores = tensor([0.6, 0.9, 0.5, 0.6, 0.8, 0.7], dtype=np.float32) - idxs = tensor([0, 1, 0, 1, 0, 1], dtype=np.int32) - results = F.batched_nms(inp, scores=scores, idxs=idxs, iou_thresh=0.5) - np.testing.assert_equal(results.numpy(), np.array([1, 4, 5], dtype=np.int32)) - - @pytest.mark.skip(reason="cuda does not support nchw int8") def test_conv_bias(): inp_scale = 1.5 @@ -483,7 +438,7 @@ def test_conv_bias(): inp = 
convert_to_nchw4(inp) w = convert_to_nchw4(w) b = convert_to_nchw4(b) - return F.conv_bias_activation( + return F.nn.conv_bias_activation( inp, w, b, @@ -505,7 +460,7 @@ def test_conv_bias(): result = F.transpose(result, (0, 1, 4, 2, 3)) expected = F.flatten(expected) result = F.flatten(result) - assertTensorClose(result.numpy(), expected.numpy(), max_err=outp_scale) + np.testing.assert_allclose(result.numpy(), expected.numpy(), atol=outp_scale) run(1, 4, 4, 24, 33, 1, 1, 2, 3, 1, 1, False) run(10, 12, 24, 46, 46, 1, 1, 2, 1, 3, 1, False) @@ -519,6 +474,15 @@ def test_conv_bias(): run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, True, "RELU") +def test_zero_stride_numpy_array(): + inp = np.random.randn(3, 224, 224).astype(np.float32) + inp = inp[np.newaxis, :] + + inp = tensor(inp, dtype=np.float32) + weight = tensor(np.random.randn(16, 3, 3, 3), dtype=np.float32) + out = F.conv2d(inp, weight, None, (2, 2), (3, 3), (1, 1), 1) + + def test_condtake(): x = np.array([[1, 2, 3], [4, 5, 6]]) y = np.array([[True, False, True], [False, True, True]]) @@ -544,3 +508,5 @@ def test_nms_is_same(): assert op1 != op3 assert op1 != op4 assert op3 != op4 + + diff --git a/imperative/python/test/unit/functional/test_distributed.py b/imperative/python/test/unit/functional/test_functional_distributed.py similarity index 100% rename from imperative/python/test/unit/functional/test_distributed.py rename to imperative/python/test/unit/functional/test_functional_distributed.py diff --git a/imperative/python/test/unit/functional/test_loss.py b/imperative/python/test/unit/functional/test_loss.py index 50b9a7dc..8bfd1cd5 100644 --- a/imperative/python/test/unit/functional/test_loss.py +++ b/imperative/python/test/unit/functional/test_loss.py @@ -12,15 +12,34 @@ import megengine.functional as F from megengine import tensor -def test_cross_entropy_with_softmax(): +def test_cross_entropy_with_logits(): data = tensor([1, 100]).astype(np.float32).reshape((1, 2)) label = tensor([1]).astype(np.int32) - loss = F.cross_entropy_with_softmax(data, label) + loss = F.nn.cross_entropy(data, label) np.testing.assert_allclose(loss.numpy(), 0.0) label = tensor([0]).astype(np.int32) - loss = F.cross_entropy_with_softmax(data, label) + loss = F.nn.cross_entropy(data, label) np.testing.assert_allclose(loss.numpy(), 100 - 1) label = np.array([1]) - loss = F.cross_entropy_with_softmax(data, label) + loss = F.nn.cross_entropy(data, label) np.testing.assert_allclose(loss.numpy(), 0.0) + + +def test_cross_entropy(): + def softmax(x): + x = np.exp(x) + x /= x.sum(1, keepdims=True) + return x + + def ref(x, y): + return np.mean([-np.log(x[i, y[i]]) for i in range(len(y))]) + + x = (np.random.rand(5, 10) - 0.5) * 4 + y = np.random.randint(10, size=(5,)) + for i in range(len(x)): + x[i, y[i]] += np.random.rand() * 2 + x = softmax(x) + l_ref = ref(x, y) + l = F.nn.cross_entropy(tensor(x, "float32"), tensor(y, "int32"), with_logits=False) + np.testing.assert_allclose(l.numpy(), l_ref) diff --git a/imperative/python/test/unit/functional/test_math.py b/imperative/python/test/unit/functional/test_math.py index 64e0b10f..367b50a8 100644 --- a/imperative/python/test/unit/functional/test_math.py +++ b/imperative/python/test/unit/functional/test_math.py @@ -9,76 +9,10 @@ from functools import partial import numpy as np +from utils import opr_test import megengine.functional as F from megengine import tensor -from megengine.test import assertTensorClose - - -def _default_compare_fn(x, y): - assertTensorClose(x.numpy(), y) - - -def opr_test(cases, func, 
compare_fn=_default_compare_fn, ref_fn=None, **kwargs): - """ - func: the function to run opr. - compare_fn: the function to compare the result and expected, use assertTensorClose if None. - ref_fn: the function to generate expected data, should assign output if None. - cases: the list which have dict element, the list length should be 2 for dynamic shape test. - and the dict should have input, - and should have output if ref_fn is None. - should use list for multiple inputs and outputs for each case. - kwargs: The additional kwargs for opr func. - - simple examples: - - dtype = np.float32 - cases = [{"input": [10, 20]}, {"input": [20, 30]}] - opr_test(cases, - F.eye, - ref_fn=lambda n, m: np.eye(n, m).astype(dtype), - dtype=dtype) - - """ - - def check_results(results, expected): - if not isinstance(results, tuple): - results = (results,) - for r, e in zip(results, expected): - compare_fn(r, e) - - def get_param(cases, idx): - case = cases[idx] - inp = case.get("input", None) - outp = case.get("output", None) - if inp is None: - raise ValueError("the test case should have input") - if not isinstance(inp, list): - inp = (inp,) - else: - inp = tuple(inp) - if ref_fn is not None and callable(ref_fn): - outp = ref_fn(*inp) - if outp is None: - raise ValueError("the test case should have output or reference function") - if not isinstance(outp, list): - outp = (outp,) - else: - outp = tuple(outp) - - return inp, outp - - if len(cases) == 0: - raise ValueError("should give one case at least") - - if not callable(func): - raise ValueError("the input func should be callable") - - inp, outp = get_param(cases, 0) - inp_tensor = [tensor(inpi) for inpi in inp] - - results = func(*inp_tensor, **kwargs) - check_results(results, outp) def common_test_reduce(opr, ref_opr): @@ -190,11 +124,11 @@ def test_normalize(): norm = np.sum(x ** p, axis=axis, keepdims=True) ** (1.0 / p) return x / np.clip(norm, a_min=eps, a_max=np.inf) - # Test L-2 norm along all dimensions - opr_test(cases, F.normalize, ref_fn=np_normalize) + # # Test L-2 norm along all dimensions + # opr_test(cases, F.normalize, ref_fn=np_normalize) - # Test L-1 norm along all dimensions - opr_test(cases, partial(F.normalize, p=1), ref_fn=partial(np_normalize, p=1)) + # # Test L-1 norm along all dimensions + # opr_test(cases, partial(F.normalize, p=1), ref_fn=partial(np_normalize, p=1)) # Test L-2 norm along the second dimension opr_test(cases, partial(F.normalize, axis=1), ref_fn=partial(np_normalize, axis=1)) diff --git a/imperative/python/test/unit/functional/test_tensor.py b/imperative/python/test/unit/functional/test_tensor.py index c17cf310..b58668aa 100644 --- a/imperative/python/test/unit/functional/test_tensor.py +++ b/imperative/python/test/unit/functional/test_tensor.py @@ -6,93 +6,36 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
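Each test file above also carried its own copy of the opr_test helper; the deleted
copies are consolidated behind "from utils import opr_test". Usage is unchanged; a
minimal sketch based on the docstring of the removed copy, assuming the shared
helper living beside these tests keeps the same signature:

    import numpy as np

    import megengine.functional as F
    from utils import opr_test  # shared test helper; exact location assumed

    dtype = np.float32
    cases = [{"input": [10, 20]}, {"input": [20, 30]}]
    # ref_fn builds the expected output from each case's inputs.
    opr_test(cases, F.eye, ref_fn=lambda n, m: np.eye(n, m).astype(dtype), dtype=dtype)
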
+import os import platform import numpy as np import pytest +from utils import opr_test import megengine.functional as F from megengine import tensor from megengine.core._trace_option import use_tensor_shape from megengine.core.tensor.utils import astensor1d from megengine.distributed.helper import get_device_count_by_fork -from megengine.test import assertTensorClose - - -def _default_compare_fn(x, y): - assertTensorClose(x.numpy(), y) - - -def opr_test(cases, func, compare_fn=_default_compare_fn, ref_fn=None, **kwargs): - """ - func: the function to run opr. - compare_fn: the function to compare the result and expected, use assertTensorClose if None. - ref_fn: the function to generate expected data, should assign output if None. - cases: the list which have dict element, the list length should be 2 for dynamic shape test. - and the dict should have input, - and should have output if ref_fn is None. - should use list for multiple inputs and outputs for each case. - kwargs: The additional kwargs for opr func. - - simple examples: - - dtype = np.float32 - cases = [{"input": [10, 20]}, {"input": [20, 30]}] - opr_test(cases, - F.eye, - ref_fn=lambda n, m: np.eye(n, m).astype(dtype), - dtype=dtype) - - """ - - def check_results(results, expected): - if not isinstance(results, tuple): - results = (results,) - for r, e in zip(results, expected): - compare_fn(r, e) - - def get_param(cases, idx): - case = cases[idx] - inp = case.get("input", None) - outp = case.get("output", None) - if inp is None: - raise ValueError("the test case should have input") - if not isinstance(inp, list): - inp = (inp,) - else: - inp = tuple(inp) - if ref_fn is not None and callable(ref_fn): - outp = ref_fn(*inp) - if outp is None: - raise ValueError("the test case should have output or reference function") - if not isinstance(outp, list): - outp = (outp,) - else: - outp = tuple(outp) - - return inp, outp - - if len(cases) == 0: - raise ValueError("should give one case at least") - - if not callable(func): - raise ValueError("the input func should be callable") - - inp, outp = get_param(cases, 0) - inp_tensor = [tensor(inpi) for inpi in inp] - - results = func(*inp_tensor, **kwargs) - check_results(results, outp) def test_eye(): dtype = np.float32 - cases = [{"input": [10, 20]}, {"input": [20, 30]}] + cases = [{"input": [10, 20]}, {"input": [30]}] for case in cases: - assertTensorClose( + np.testing.assert_allclose( F.eye(case["input"], dtype=dtype).numpy(), np.eye(*case["input"]).astype(dtype), ) + np.testing.assert_allclose( + F.eye(*case["input"], dtype=dtype).numpy(), + np.eye(*case["input"]).astype(dtype), + ) + np.testing.assert_allclose( + F.eye(tensor(case["input"]), dtype=dtype).numpy(), + np.eye(*case["input"]).astype(dtype), + ) def test_concat(): @@ -165,7 +108,7 @@ def test_squeeze(): for axis in [None, 3, -4, (3, -4)]: y = np.squeeze(x, axis) - yy = F.remove_axis(xx, axis) + yy = F.squeeze(xx, axis) np.testing.assert_equal(y, yy.numpy()) @@ -175,7 +118,7 @@ def test_expand_dims(): for axis in [2, -3, (3, -4), (1, -4)]: y = np.expand_dims(x, axis) - yy = F.add_axis(xx, axis) + yy = F.expand_dims(xx, axis) np.testing.assert_equal(y, yy.numpy()) @@ -265,37 +208,37 @@ def test_flatten(): data1 = np.random.random(data1_shape).astype(np.float32) def compare_fn(x, y): - assert x.numpy().shape == y[0] + assert x.shape[0] == y output0 = (2 * 3 * 4 * 5,) output1 = (4 * 5 * 6 * 7,) cases = [ - {"input": data0, "output": (output0,)}, - {"input": data1, "output": (output1,)}, + {"input": data0, "output": output0}, + 
{"input": data1, "output": output1}, ] opr_test(cases, F.flatten, compare_fn=compare_fn) output0 = (2, 3 * 4 * 5) output1 = (4, 5 * 6 * 7) cases = [ - {"input": data0, "output": (output0,)}, - {"input": data1, "output": (output1,)}, + {"input": data0, "output": output0}, + {"input": data1, "output": output1}, ] opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1) output0 = (2, 3, 4 * 5) output1 = (4, 5, 6 * 7) cases = [ - {"input": data0, "output": (output0,)}, - {"input": data1, "output": (output1,)}, + {"input": data0, "output": output0}, + {"input": data1, "output": output1}, ] opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=2) output0 = (2, 3 * 4, 5) output1 = (4, 5 * 6, 7) cases = [ - {"input": data0, "output": (output0,)}, - {"input": data1, "output": (output1,)}, + {"input": data0, "output": output0}, + {"input": data1, "output": output1}, ] opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1, end_axis=2) @@ -305,18 +248,28 @@ def test_broadcast(): output1_shape = (30, 20, 30) data1 = np.random.random(input1_shape).astype(np.float32) - input2_shape = (10, 20) + input2_shape = (10, 1) output2_shape = (20, 10, 20) data2 = np.random.random(input2_shape).astype(np.float32) def compare_fn(x, y): - assert x.numpy().shape == y + assert x.shape[0] == y cases = [ {"input": [data1, output1_shape], "output": output1_shape}, {"input": [data2, output2_shape], "output": output2_shape}, ] - opr_test(cases, F.broadcast, compare_fn=compare_fn) + opr_test(cases, F.broadcast_to, compare_fn=compare_fn) + + x = F.ones((2, 1, 3)) + with pytest.raises(ValueError): + F.broadcast_to(x, (2, 3, 4)) + + with pytest.raises(ValueError): + F.broadcast_to(x, (4, 1, 3)) + + with pytest.raises(ValueError): + F.broadcast_to(x, (1, 3)) def test_utils_astensor1d(): @@ -369,7 +322,7 @@ def test_device(): def test_identity(): x = tensor(np.random.random((5, 10)).astype(np.float32)) - y = F.identity(x) + y = F.copy(x) np.testing.assert_equal(y.numpy(), x) @@ -414,19 +367,3 @@ def test_copy_d2h(): def test_copy_d2d(): copy_test("gpu0", "gpu1") copy_test("gpu0:0", "gpu0:1") - - -def test_param_pack_split(): - a = tensor(np.ones((10,), np.int32)) - b, c = F.param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)]) - assert np.allclose(b.numpy(), a.numpy()[1]) - assert np.allclose(c.numpy(), a.numpy()[1:].reshape(3, 3)) - - -def test_param_pack_concat(): - a = tensor(np.ones((1,), np.int32)) - b = tensor(np.ones((3, 3), np.int32)) - offsets_val = [0, 1, 1, 10] - offsets = tensor(offsets_val, np.int32) - c = F.param_pack_concat([a, b], offsets, offsets_val) - assert np.allclose(np.concatenate([a.numpy(), b.numpy().flatten()]), c.numpy()) diff --git a/imperative/python/test/unit/module/test_activation.py b/imperative/python/test/unit/module/test_activation.py index afca1fde..8212b47e 100644 --- a/imperative/python/test/unit/module/test_activation.py +++ b/imperative/python/test/unit/module/test_activation.py @@ -10,7 +10,6 @@ import numpy as np import megengine as mge from megengine.module import LeakyReLU -from megengine.test import assertTensorClose def test_leaky_relu(): @@ -21,4 +20,4 @@ def test_leaky_relu(): output = leaky_relu(mge.tensor(data)) np_output = np.maximum(0, data) + negative_slope * np.minimum(0, data) - assertTensorClose(output.numpy(), np_output, max_err=0) + np.testing.assert_equal(output.numpy(), np_output) diff --git a/imperative/python/test/unit/module/test_batchnorm.py b/imperative/python/test/unit/module/test_batchnorm.py index c99dd6b3..e48f96d8 100644 --- 
a/imperative/python/test/unit/module/test_batchnorm.py +++ b/imperative/python/test/unit/module/test_batchnorm.py @@ -6,6 +6,7 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import functools import multiprocessing as mp import platform @@ -17,7 +18,8 @@ import megengine.distributed as dist from megengine import Tensor from megengine.core._trace_option import use_tensor_shape from megengine.module import BatchNorm1d, BatchNorm2d, SyncBatchNorm -from megengine.test import assertTensorClose + +_assert_allclose = functools.partial(np.testing.assert_allclose, atol=5e-6, rtol=5e-6) @pytest.mark.skipif( @@ -47,9 +49,9 @@ def test_syncbn(): for i in range(steps): yv = bn(Tensor(data[i])) - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) - assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6) - assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6) + _assert_allclose(yv.numpy(), yv_expect) + _assert_allclose(bn.running_mean.numpy(), running_mean) + _assert_allclose(bn.running_var.numpy(), running_var) xv = [] for i in range(steps): @@ -119,13 +121,9 @@ def test_batchnorm(): yv = bn(Tensor(xv)) yv_expect = (xv - mean) / sd - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) - assertTensorClose( - running_mean.reshape(-1), bn.running_mean.numpy().reshape(-1), max_err=5e-6 - ) - assertTensorClose( - running_var.reshape(-1), bn.running_var.numpy().reshape(-1), max_err=5e-6 - ) + _assert_allclose(yv.numpy(), yv_expect) + _assert_allclose(bn.running_mean.numpy().reshape(-1), running_mean.reshape(-1)) + _assert_allclose(bn.running_var.numpy().reshape(-1), running_var.reshape(-1)) # test set 'training' flag to False mean_backup = bn.running_mean.numpy() @@ -135,11 +133,11 @@ def test_batchnorm(): data = Tensor(xv) yv1 = bn(data) yv2 = bn(data) - assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) - assertTensorClose(mean_backup, bn.running_mean.numpy(), max_err=0) - assertTensorClose(var_backup, bn.running_var.numpy(), max_err=0) + np.testing.assert_equal(yv1.numpy(), yv2.numpy()) + np.testing.assert_equal(mean_backup, bn.running_mean.numpy()) + np.testing.assert_equal(var_backup, bn.running_var.numpy()) yv_expect = (xv - running_mean) / np.sqrt(running_var + bn.eps) - assertTensorClose(yv_expect, yv1.numpy(), max_err=5e-6) + _assert_allclose(yv1.numpy(), yv_expect) @pytest.mark.skipif( @@ -173,13 +171,9 @@ def test_syncbn1d(): yv = bn(Tensor(xv)) yv_expect = (xv - mean) / sd - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) - assertTensorClose( - running_mean.reshape(-1), bn.running_mean.numpy().reshape(-1), max_err=5e-6 - ) - assertTensorClose( - running_var.reshape(-1), bn.running_var.numpy().reshape(-1), max_err=5e-6 - ) + _assert_allclose(yv.numpy(), yv_expect) + _assert_allclose(bn.running_mean.numpy().reshape(-1), running_mean.reshape(-1)) + _assert_allclose(bn.running_var.numpy().reshape(-1), running_var.reshape(-1)) # test set 'training' flag to False mean_backup = bn.running_mean.numpy() @@ -189,11 +183,11 @@ def test_syncbn1d(): data = Tensor(xv) yv1 = bn(data) yv2 = bn(data) - assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) - assertTensorClose(mean_backup, bn.running_mean.numpy(), max_err=0) - assertTensorClose(var_backup, bn.running_var.numpy(), max_err=0) + np.testing.assert_equal(yv1.numpy(), yv2.numpy()) + np.testing.assert_equal(mean_backup, 
bn.running_mean.numpy()) + np.testing.assert_equal(var_backup, bn.running_var.numpy()) yv_expect = (xv - running_mean) / np.sqrt(running_var + bn.eps) - assertTensorClose(yv_expect, yv1.numpy(), max_err=5e-6) + _assert_allclose(yv1.numpy(), yv_expect) def test_batchnorm2d(): @@ -221,9 +215,9 @@ def test_batchnorm2d(): yv = bn(Tensor(xv)) yv_expect = (xv - mean) / sd - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) - assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6) - assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6) + _assert_allclose(yv.numpy(), yv_expect) + _assert_allclose(bn.running_mean.numpy(), running_mean) + _assert_allclose(bn.running_var.numpy(), running_var) # test set 'training' flag to False mean_backup = bn.running_mean.numpy() @@ -233,11 +227,11 @@ def test_batchnorm2d(): data = Tensor(xv) yv1 = bn(data) yv2 = bn(data) - assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) - assertTensorClose(mean_backup, bn.running_mean.numpy(), max_err=0) - assertTensorClose(var_backup, bn.running_var.numpy(), max_err=0) + np.testing.assert_equal(yv1.numpy(), yv2.numpy()) + np.testing.assert_equal(mean_backup, bn.running_mean.numpy()) + np.testing.assert_equal(var_backup, bn.running_var.numpy()) yv_expect = (xv - running_mean) / np.sqrt(running_var + bn.eps) - assertTensorClose(yv_expect, yv1.numpy(), max_err=5e-6) + _assert_allclose(yv1.numpy(), yv_expect) @pytest.mark.skipif( @@ -272,9 +266,9 @@ def test_syncbn2d(): yv = bn(Tensor(xv)) yv_expect = (xv - mean) / sd - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) - assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6) - assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6) + _assert_allclose(yv.numpy(), yv_expect) + _assert_allclose(bn.running_mean.numpy(), running_mean) + _assert_allclose(bn.running_var.numpy(), running_var) # test set 'training' flag to False mean_backup = bn.running_mean.numpy() @@ -284,11 +278,11 @@ def test_syncbn2d(): data = Tensor(xv) yv1 = bn(data) yv2 = bn(data) - assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) - assertTensorClose(mean_backup, bn.running_mean.numpy(), max_err=0) - assertTensorClose(var_backup, bn.running_var.numpy(), max_err=0) + np.testing.assert_equal(yv1.numpy(), yv2.numpy()) + np.testing.assert_equal(mean_backup, bn.running_mean.numpy()) + np.testing.assert_equal(var_backup, bn.running_var.numpy()) yv_expect = (xv - running_mean) / np.sqrt(running_var + bn.eps) - assertTensorClose(yv_expect, yv1.numpy(), max_err=5e-6) + _assert_allclose(yv1.numpy(), yv_expect) def test_batchnorm_no_stats(): @@ -311,7 +305,7 @@ def test_batchnorm_no_stats(): yv = bn(Tensor(xv)) yv_expect = (xv - mean) / sd - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) + _assert_allclose(yv.numpy(), yv_expect) @pytest.mark.skipif( @@ -341,7 +335,7 @@ def test_syncbn_no_stats(): yv = bn(Tensor(xv)) yv_expect = (xv - mean) / sd - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) + _assert_allclose(yv.numpy(), yv_expect) def test_batchnorm2d_no_stats(): @@ -363,7 +357,7 @@ def test_batchnorm2d_no_stats(): yv = bn(Tensor(xv)) yv_expect = (xv - mean) / sd - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) + _assert_allclose(yv.numpy(), yv_expect) @pytest.mark.skipif( @@ -392,4 +386,4 @@ def test_syncbn2d_no_stats(): yv = bn(Tensor(xv)) yv_expect = (xv - mean) / sd - assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) + _assert_allclose(yv.numpy(), yv_expect) diff --git 
a/imperative/python/test/unit/module/test_conv.py b/imperative/python/test/unit/module/test_conv.py
index 262498f9..5d06e5de 100644
--- a/imperative/python/test/unit/module/test_conv.py
+++ b/imperative/python/test/unit/module/test_conv.py
@@ -12,7 +12,6 @@ import numpy as np

 from megengine import Parameter, tensor
 from megengine.module import ConvTranspose2d, LocalConv2d
-from megengine.test import assertTensorClose


 def test_conv_transpose2d():
@@ -49,62 +48,75 @@ def test_conv_transpose2d():
     conv_transpose2d.bias = Parameter(bias, dtype=np.float32)

     y = conv_transpose2d(tensor(inp))
-    assertTensorClose(out, y.numpy(), max_err=2e-6)
+    np.testing.assert_allclose(out, y.numpy(), atol=2e-6)


 def test_local_conv2d():
-    batch_size = 10
-    in_channels = 4
-    out_channels = 8
-    input_height = 8
-    input_width = 8
-    kernel_size = 3
-    stride = 1
-    padding = 1
-    dilation = 1
-    groups = 1
-    local_conv2d = LocalConv2d(
-        in_channels=in_channels,
-        out_channels=out_channels,
-        input_height=input_height,
-        input_width=input_width,
-        kernel_size=kernel_size,
-        stride=stride,
-        padding=padding,
-        dilation=dilation,
-        groups=groups,
-    )
-    inputs = np.random.normal(
-        size=(batch_size, in_channels, input_height, input_width)
-    ).astype(np.float32)
-    output_height = (input_height + padding * 2 - kernel_size) // stride + 1
-    output_width = (input_width + padding * 2 - kernel_size) // stride + 1
-    weights = np.random.normal(
-        size=(
-            groups,
-            output_height,
-            output_width,
-            in_channels // groups,
-            kernel_size,
-            kernel_size,
-            out_channels // groups,
-        )
-    ).astype(np.float32)
-    local_conv2d.weight = Parameter(weights)
-    outputs = local_conv2d(tensor(inputs))
-    # naive calculation use numpy
-    # only test output_height == input_height, output_width == input_width, group == 1
-    inputs = np.pad(inputs, ((0, 0), (0, 0), (1, 1), (1, 1)))
-    expected = np.zeros(
-        (batch_size, out_channels, output_height, output_width), dtype=np.float32,
-    )
-    for n, oc, oh, ow in itertools.product(
-        *map(range, [batch_size, out_channels, output_height, output_width])
+    def test_func(
+        batch_size,
+        in_channels,
+        out_channels,
+        input_height,
+        input_width,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        groups,
     ):
-        ih, iw = oh * stride, ow * stride
-        expected[n, oc, ih, iw] = np.sum(
-            inputs[n, :, ih : ih + kernel_size, iw : iw + kernel_size]
-            * weights[0, oh, ow, :, :, :, oc]
+        local_conv2d = LocalConv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            input_height=input_height,
+            input_width=input_width,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+        )
+        inputs = np.random.normal(
+            size=(batch_size, in_channels, input_height, input_width)
+        ).astype(np.float32)
+        output_height = (input_height + padding * 2 - kernel_size) // stride + 1
+        output_width = (input_width + padding * 2 - kernel_size) // stride + 1
+        weights = np.random.normal(
+            size=(
+                groups,
+                output_height,
+                output_width,
+                in_channels // groups,
+                kernel_size,
+                kernel_size,
+                out_channels // groups,
+            )
+        ).astype(np.float32)
+        local_conv2d.weight = Parameter(weights)
+        outputs = local_conv2d(tensor(inputs))
+        # naive calculation using numpy
+        # only test output_height == input_height, output_width == input_width
+        inputs = np.pad(inputs, ((0, 0), (0, 0), (1, 1), (1, 1)))
+        expected = np.zeros(
+            (batch_size, out_channels, output_height, output_width), dtype=np.float32,
+        )
+        ic_group_size = in_channels // groups
+        oc_group_size = out_channels // groups
+        for n, oc, oh, ow in itertools.product(
+            *map(range, [batch_size, out_channels, output_height, output_width])
+        ):
+            ih, iw = oh * stride, ow * stride
+            g_id = oc // oc_group_size
+            expected[n, oc, ih, iw] = np.sum(
+                inputs[
+                    n,
+                    g_id * ic_group_size : (g_id + 1) * ic_group_size,
+                    ih : ih + kernel_size,
+                    iw : iw + kernel_size,
+                ]
+                * weights[g_id, oh, ow, :, :, :, oc % oc_group_size]
+            )
+        np.testing.assert_allclose(outputs.numpy(), expected, atol=1e-5)
-    assertTensorClose(outputs.numpy(), expected, max_err=1e-5)
+    test_func(10, 4, 4, 5, 5, 3, 1, 1, 1, 1)
+    test_func(10, 32, 32, 8, 8, 3, 1, 1, 1, 2)
+    test_func(10, 32, 32, 8, 8, 3, 1, 1, 1, 4)
diff --git a/imperative/python/test/unit/module/test_module.py b/imperative/python/test/unit/module/test_module.py
index d4a5f304..99712413 100644
--- a/imperative/python/test/unit/module/test_module.py
+++ b/imperative/python/test/unit/module/test_module.py
@@ -21,12 +21,14 @@ from megengine.module import (
     BatchNorm1d,
     BatchNorm2d,
     Conv2d,
+    Dropout,
     Linear,
+    MaxPool2d,
     Module,
     Sequential,
+    Softmax,
 )
 from megengine.quantization.quantize import quantize, quantize_qat
-from megengine.test import assertTensorClose


 class MLP(Module):
@@ -84,7 +86,7 @@ def graph_mode(*modes):


 def _default_compare_fn(x, y):
-    assertTensorClose(x.numpy(), y)
+    np.testing.assert_allclose(x.numpy(), y, rtol=1e-6)


 def opr_test(
@@ -99,7 +101,7 @@ def opr_test(
     mode: the list of test mode which are eager, static and dynamic_shape
           will test all the cases if None.
     func: the function to run opr.
-    compare_fn: the function to compare the result and expected, use assertTensorClose if None.
+    compare_fn: the function to compare the result and expected, use np.testing.assert_allclose if None.
     ref_fn: the function to generate expected data, should assign output if None.
     cases: the list which have dict element, the list length should be 2 for dynamic shape test.
and the dict should have input, @@ -325,20 +327,20 @@ def test_module_api_hooks(): assert pre_hook_num == 4 assert post_hook_num == 4 mean1 = Parameter(np.zeros(shape), dtype=np.float32) - bn1 = F.batch_norm2d( + bn1 = F.batch_norm( x + 3, mean1, Parameter(np.ones(shape), dtype=np.float32), training=True ) - assertTensorClose( + np.testing.assert_allclose( net.i.bn.running_mean.numpy(), mean1.numpy(), ) mean2 = Parameter(np.zeros(shape), dtype=np.float32) - bn2 = F.batch_norm2d( + bn2 = F.batch_norm( bn1 + 3, mean2, Parameter(np.ones(shape), dtype=np.float32), training=True ) - assertTensorClose( + np.testing.assert_allclose( net.bn.running_mean.numpy(), mean2.numpy(), ) - assertTensorClose((bn2 + 2).numpy(), y.numpy()) + np.testing.assert_allclose((bn2 + 2).numpy(), y.numpy()) assert len(hooks) == 8 for handler in hooks: @@ -457,9 +459,9 @@ def test_sequential_named_children(): modules["name2"] = Linear(5, 1) m = Sequential(modules) l = list(m.named_children()) - assert l[0][0] == "layer_values.0" - assert l[1][0] == "layer_values.1" - assert l[2][0] == "layer_values.2" + assert l[0][0] == "name0" + assert l[1][0] == "name1" + assert l[2][0] == "name2" def test_state_dict(): @@ -476,7 +478,7 @@ def test_state_dict(): mlp1 = MLP() mlp1.load_state_dict(state_dict, strict=False) pred1 = mlp1(data) - assertTensorClose(pred0.numpy(), pred1.numpy(), max_err=5e-6) + np.testing.assert_allclose(pred0.numpy(), pred1.numpy(), atol=5e-6) with pytest.raises(KeyError): mlp1.load_state_dict(state_dict) del state_dict["extra"] @@ -517,13 +519,13 @@ def test_shared_param(): net = Simple() assert net.conv0.weight is net.conv1.weight data = tensor(np.random.random((1, 1, 8, 8)).astype(np.float32)) - assertTensorClose(net.conv0(data).numpy(), net.conv1(data).numpy()) + np.testing.assert_allclose(net.conv0(data).numpy(), net.conv1(data).numpy()) with BytesIO() as f: mge.save(net, f) f.seek(0) net1 = mge.load(f) assert net1.conv0.weight is net1.conv1.weight - assertTensorClose(net1.conv0(data).numpy(), net1.conv1(data).numpy()) + np.testing.assert_allclose(net1.conv0(data).numpy(), net1.conv1(data).numpy()) with BytesIO() as f: mge.save(net.conv0, f) @@ -536,7 +538,7 @@ def test_shared_param(): conv1 = mge.load(f) assert conv0.weight is not conv1.weight - assertTensorClose(conv0(data).numpy(), conv1(data).numpy()) + np.testing.assert_allclose(conv0(data).numpy(), conv1(data).numpy()) def test_pickle_module(): @@ -559,8 +561,8 @@ def test_pickle_module(): mlp1 = mge.load(fout) pred2 = mlp1(data) - assertTensorClose(pred0.numpy(), pred1.numpy(), max_err=5e-6) - assertTensorClose(pred0.numpy(), pred2.numpy(), max_err=5e-6) + np.testing.assert_allclose(pred0.numpy(), pred1.numpy(), atol=5e-6) + np.testing.assert_allclose(pred0.numpy(), pred2.numpy(), atol=5e-6) @pytest.mark.skip(reason="under development") @@ -606,6 +608,114 @@ def test_load_quantized(): mlp.load_state_dict(checkpoint) pred1 = mlp(data) - assertTensorClose( - pred0.astype("float32").numpy(), pred1.astype("float32").numpy(), max_err=5e-6 + np.testing.assert_allclose( + pred0.astype("float32").numpy(), pred1.astype("float32").numpy(), atol=5e-6 ) + + +def test_repr_basic(): + # test whether __repr__ can output correct information + class ConvModel(Module): + def __init__(self): + super().__init__() + self.conv1 = Conv2d(3, 128, 3, stride=2, bias=False) + self.conv2 = Conv2d(3, 128, 3, padding=1, bias=False) + self.conv3 = Conv2d(3, 128, 3, dilation=2, bias=False) + self.bn1 = BatchNorm2d(128) + self.bn2 = BatchNorm1d(128) + self.dropout = 
Dropout(drop_prob=0.1) + self.softmax = Softmax(axis=100) + self.pooling = MaxPool2d(kernel_size=2, padding=0) + self.submodule1 = Sequential(Dropout(drop_prob=0.1), Softmax(axis=100),) + self.fc1 = Linear(512, 1024) + + def forward(self, inputs): + pass + + ground_truth = ( + "ConvModel(\n" + " (conv1): Conv2d(3, 128, kernel_size=(3, 3), stride=(2, 2), bias=False)\n" + " (conv2): Conv2d(3, 128, kernel_size=(3, 3), padding=(1, 1), bias=False)\n" + " (conv3): Conv2d(3, 128, kernel_size=(3, 3), dilation=(2, 2), bias=False)\n" + " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)\n" + " (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)\n" + " (dropout): Dropout(drop_prob=0.1)\n (softmax): Softmax(axis=100)\n" + " (pooling): MaxPool2d(kernel_size=2, stride=2, padding=0)\n" + " (submodule1): Sequential(\n" + " (0): Dropout(drop_prob=0.1)\n" + " (1): Softmax(axis=100)\n )\n" + " (fc1): Linear(in_features=512, out_features=1024, bias=True)\n" + ")" + ) + net = ConvModel() + output = net.__repr__() + assert output == ground_truth + + +def test_repr_module_reassign(): + # test whether __repr__ can deal with module reassign + class ConvModel1(Module): + def __init__(self): + super().__init__() + self.conv1 = Conv2d(3, 128, 3, bias=False) + self.conv2 = Conv2d(3, 128, 3, padding=1, bias=False) + self.conv1 = Conv2d(3, 256, 3, dilation=2, bias=False) + + def forward(self, inputs): + pass + + ground_truth = ( + "ConvModel1(\n" + " (conv1): Conv2d(3, 256, kernel_size=(3, 3), dilation=(2, 2), bias=False)\n" + " (conv2): Conv2d(3, 128, kernel_size=(3, 3), padding=(1, 1), bias=False)\n" + ")" + ) + net = ConvModel1() + output = net.__repr__() + assert output == ground_truth + + +def test_repr_module_rereference(): + # test whether __repr__ can deal with module re-reference + class ConvModel2(Module): + def __init__(self): + super().__init__() + self.conv1 = Conv2d(3, 128, 3, bias=False) + self.conv2 = self.conv1 + self.conv3 = self.conv1 + + def forward(self, inputs): + pass + + ground_truth = ( + "ConvModel2(\n" + " (conv1): Conv2d(3, 128, kernel_size=(3, 3), bias=False)\n" + " (conv2): Conv2d(3, 128, kernel_size=(3, 3), bias=False)\n" + " (conv3): Conv2d(3, 128, kernel_size=(3, 3), bias=False)\n" + ")" + ) + net = ConvModel2() + output = net.__repr__() + assert output == ground_truth + + +def test_repr_module_delete(): + # test whether __repr__ can deal with module delete + class ConvModel3(Module): + def __init__(self): + super().__init__() + self.conv1 = Conv2d(3, 128, 3, bias=False) + self.softmax = Softmax(100) + + def forward(self, inputs): + pass + + ground_truth = ( + "ConvModel3(\n" + " (conv1): Conv2d(3, 128, kernel_size=(3, 3), bias=False)\n" + ")" + ) + net = ConvModel3() + del net.softmax + output = net.__repr__() + assert output == ground_truth diff --git a/imperative/python/test/unit/module/test_tensor.py b/imperative/python/test/unit/module/test_module_tensor.py similarity index 83% rename from imperative/python/test/unit/module/test_tensor.py rename to imperative/python/test/unit/module/test_module_tensor.py index 38686b08..773bf373 100644 --- a/imperative/python/test/unit/module/test_tensor.py +++ b/imperative/python/test/unit/module/test_module_tensor.py @@ -15,7 +15,6 @@ import megengine as mge import megengine.functional as F from megengine import Parameter, Tensor from megengine.module import Conv2d -from megengine.test import assertTensorClose def test_set_value(): @@ -23,21 +22,21 @@ def test_set_value(): 
param = Parameter(v0) v1 = np.random.random((2, 3)).astype(np.float32) param.set_value(v1) - assertTensorClose(param.numpy(), v1, max_err=5e-6) + np.testing.assert_allclose(param.numpy(), v1, atol=5e-6) v2 = np.random.random((3, 3)).astype(np.float32) # TODO: add this # with pytest.raises(ValueError): # param.set_value(v2) - assertTensorClose(param.numpy(), v1, max_err=5e-6) + np.testing.assert_allclose(param.numpy(), v1, atol=5e-6) @pytest.mark.skip(reason="fill unsupported") def test_fill(): a = Tensor(np.zeros((2, 3), dtype=np.float32)) a.fill(3) - assertTensorClose(a.numpy(), np.full((2, 3), 3, dtype=np.float32)) + np.testing.assert_allclose(a.numpy(), np.full((2, 3), 3, dtype=np.float32)) a.fill(124.568) - assertTensorClose(a.numpy(), np.full((2, 3), 124.568, dtype=np.float32)) + np.testing.assert_allclose(a.numpy(), np.full((2, 3), 124.568, dtype=np.float32)) # TODO: remove or rewrite following test @@ -51,11 +50,11 @@ def test_fill(): # f = compile(v, None) # out, = f() -# assertTensorClose(out, p_ * 2) +# np.testing.assert_allclose(out, p_ * 2) # F.add_update(p, p) # out, = f() -# assertTensorClose(out, p_ * 4) +# np.testing.assert_allclose(out, p_ * 4) # TODO: remove or rewrite following test @@ -74,7 +73,7 @@ def test_fill(): # data1 = Input("data", value=v) # out1 = net(data1) -# assertTensorClose(out0, out1.numpy()) +# np.testing.assert_allclose(out0, out1.numpy()) # def test_shape_warning(): diff --git a/imperative/python/test/unit/module/test_qat.py b/imperative/python/test/unit/module/test_qat.py index d9312bb4..718d6acb 100644 --- a/imperative/python/test/unit/module/test_qat.py +++ b/imperative/python/test/unit/module/test_qat.py @@ -12,7 +12,6 @@ from megengine.module import ( QuantStub, ) from megengine.quantization.quantize import disable_fake_quant, quantize_qat -from megengine.test import assertTensorClose def test_qat_convbn2d(): @@ -31,22 +30,24 @@ def test_qat_convbn2d(): # import pdb # pdb.set_trace() qat_outputs = qat_module(inputs) - assertTensorClose(normal_outputs.numpy(), qat_outputs.numpy(), max_err=5e-6) - assertTensorClose( + np.testing.assert_allclose( + normal_outputs.numpy(), qat_outputs.numpy(), atol=5e-6 + ) + np.testing.assert_allclose( module.bn.running_mean.numpy(), qat_module.bn.running_mean.numpy(), - max_err=5e-8, + atol=5e-8, ) - assertTensorClose( - module.bn.running_var.numpy(), - qat_module.bn.running_var.numpy(), - max_err=5e-7, + np.testing.assert_allclose( + module.bn.running_var.numpy(), qat_module.bn.running_var.numpy(), atol=5e-7, ) module.eval() normal_outputs = module(inputs) qat_module.eval() qat_outputs = qat_module(inputs) - assertTensorClose(normal_outputs.numpy(), qat_outputs.numpy(), max_err=5e-6) + np.testing.assert_allclose( + normal_outputs.numpy(), qat_outputs.numpy(), atol=5e-6 + ) def test_qat_conv(): @@ -82,10 +83,10 @@ def test_qat_conv(): disable_fake_quant(qat_net) normal_outputs = net(inputs) qat_outputs = qat_net(inputs) - assertTensorClose(normal_outputs.numpy(), qat_outputs.numpy()) + np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy()) net.eval() normal_outputs = net(inputs) qat_net.eval() qat_outputs = qat_net(inputs) - assertTensorClose(normal_outputs.numpy(), qat_outputs.numpy()) + np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy()) diff --git a/imperative/python/test/unit/quantization/test_fake_quant.py b/imperative/python/test/unit/quantization/test_fake_quant.py index ff999b75..3d36847e 100644 --- a/imperative/python/test/unit/quantization/test_fake_quant.py +++ 
b/imperative/python/test/unit/quantization/test_fake_quant.py @@ -13,7 +13,6 @@ import megengine as mge from megengine import tensor from megengine.quantization.fake_quant import TQT_Function from megengine.quantization.internal_fake_quant import * -from megengine.test import assertTensorClose class numpy_TQT_Function: @@ -60,13 +59,16 @@ def test_TQT(): nf = numpy_TQT_Function(-127, 127) def check_inp(a, b, c, a_np, b_np, c_np): - assertTensorClose( - f.forward(a, b).numpy(), nf.forward(a_np, b_np).astype("float32") + np.testing.assert_allclose( + f.forward(a, b).numpy(), + nf.forward(a_np, b_np).astype("float32"), + rtol=1e-6, + atol=1e-6, ) c1, c2 = f.backward(c) c1_np, c2_np = nf.backward(c_np) - assertTensorClose(c1.numpy(), c1_np.astype("float32")) - assertTensorClose(c2.numpy(), c2_np.astype("float32")) + np.testing.assert_allclose(c1.numpy(), c1_np.astype("float32"), rtol=1e-6) + np.testing.assert_allclose(c2.numpy(), c2_np.astype("float32"), rtol=1e-6) a_np = np.random.random((4, 3)).astype("float32") b_np = np.random.random((1)).astype("float32") diff --git a/imperative/python/test/unit/test_cgtools.py b/imperative/python/test/unit/test_cgtools.py index e74f2db1..3f0f341e 100644 --- a/imperative/python/test/unit/test_cgtools.py +++ b/imperative/python/test/unit/test_cgtools.py @@ -89,3 +89,12 @@ def test_graph_traversal(): _, var_idx = var2oprs[input_var.id][0] assert var_idx == 0 + + +def test_load_refcnt(): + graph = mgb_graph.Graph() + varnode = graph.make_const(0) + buf, _ = mgb_graph.dump_graph([varnode]) + graph, _, (varnode,) = mgb_graph.load_graph(io.BytesIO(buf)) + del graph + varnode.owner diff --git a/imperative/python/test/unit/test_tracing.py b/imperative/python/test/unit/test_tracing.py index 5c63f8b3..bca796a3 100644 --- a/imperative/python/test/unit/test_tracing.py +++ b/imperative/python/test/unit/test_tracing.py @@ -13,6 +13,7 @@ import numpy as np import pytest import megengine.core.tensor.megbrain_graph as G +import megengine.functional as F from megengine import cgtools, tensor from megengine.core._trace_option import set_tensor_shape from megengine.core.ops import builtin as ops @@ -22,29 +23,6 @@ from megengine.functional import exp, log from megengine.jit import exclude_from_trace, trace -def load_and_inference(file, inp_data): - cg, _, out_list = G.load_graph(file) - inputs = cgtools.get_dep_vars(out_list, "Host2DeviceCopy") - replace_dict = {} - inp_node_list = [] - for i in inputs: - inp_node = G.InputNode( - device="xpux", dtype=inputs[0].dtype, graph=inputs[0].graph - ) - replace_dict[i] = inp_node.outputs[0] - inp_node_list.append(inp_node) - new_out = cgtools.replace_vars(out_list, replace_dict) - out_node_list = [G.OutputNode(i) for i in new_out] - new_out_list = [i.outputs[0] for i in out_node_list] - new_cg = new_out_list[0].graph - func = new_cg.compile(new_out_list) - for node, value in zip(inp_node_list, inp_data): - node.set_value(as_raw_tensor(value)._dev_tensor()) - func.execute() - out_data_list = [o.get_value().numpy() for o in out_node_list] - return out_data_list - - def test_trace(): for symbolic in [False, True]: @@ -124,7 +102,7 @@ def test_dump(): np.testing.assert_equal(dump_info.inputs, ["h2d[0]", "h2d[2]"]) np.testing.assert_equal(dump_info.outputs, ["ADD(h2d[0],h2d[2])[4]"]) file.seek(0) - result = load_and_inference(file, [a, b]) + result = cgtools.load_and_inference(file, [a, b]) np.testing.assert_equal(result[0], y) @@ -146,7 +124,7 @@ def test_capture_dump(): file = io.BytesIO() f.dump(file) file.seek(0) - result = 
load_and_inference(file, [x]) + result = cgtools.load_and_inference(file, [x]) np.testing.assert_equal(result[0], y) @@ -172,7 +150,7 @@ def test_dump_volatile(): (out,) = outputs assert ( cgtools.get_owner_opr_type(cgtools.get_owner_opr_inputs(out)[1]) - == "SharedDeviceTensor" + == "ImmutableTensor" ) @@ -257,6 +235,18 @@ def test_optimize_for_inference(): assert computing_input.dtype == np.float16 +def test_optimize_for_inference_broadcast(): + a = tensor(np.ones(1, dtype=np.float32)) + + @trace(capture_as_const=True, tensor_shape=True) + def f(): + (b,) = apply(ops.Broadcast(), a, tensor([1, 10], dtype=np.int32)) + return b + + f() + f.dump(io.BytesIO()) + + def test_trace_cvt_bool(): set_tensor_shape(True) x = tensor([0], dtype=np.int32) @@ -284,3 +274,123 @@ def test_trace_reshape(): f(x1) f(x2) f(x3) + + +def test_trace_topk(): + x = tensor([5, 2, 7, 1, 0, 3, 2]) + + @trace(symbolic=True) + def f(x): + y = F.topk(x, 3) + np.testing.assert_equal(y[0].shape.numpy(), np.array([3,])) + return y + + for i in range(3): + f(x) + + +def test_trace_warp_perspective(): + inp_shape = (1, 1, 4, 4) + x = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape)) + M_shape = (1, 3, 3) + M = tensor( + np.array( + [[1.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 1.0]], dtype=np.float32 + ).reshape(M_shape) + ) + + @trace(symbolic=True) + def f(x, M): + out = F.warp_perspective(x, M, (2, 2)) + np.testing.assert_equal(out.shape.numpy(), np.array([1, 1, 2, 2])) + return out + + for i in range(1): + f(x, M) + + +def test_raise_on_trace(): + step_count = 0 + catch_count = 0 + bad_step = 10 + + class CatchMe(Exception): + pass + + a = tensor([1, 2, 3, 4]) + b = tensor([5, 6, 7, 8]) + c = tensor([9, 0, 1, 2]) + + @trace + def add_abc(a, b, c): + print("Hello") + ps = a + b + result = ps + c + if step_count == bad_step: + raise CatchMe("catch me") + return result + + for i in range(100): + try: + d = add_abc(a, b, c) + except CatchMe as e: + catch_count += 1 + else: + np.testing.assert_equal(d.numpy(), (a + b + c).numpy()) + step_count += 1 + + assert catch_count == 1 + + +def test_trace_broadcast(): + for symbolic in [False, True]: + set_tensor_shape(True) + x1 = tensor(np.random.randn(3, 1, 1)) + x2 = tensor(np.random.randn(1, 4, 1)) + x3 = tensor(np.random.randn(1, 1, 5)) + + @trace(symbolic=symbolic, capture_as_const=True) + def f(x): + y = F.broadcast_to(x, (3, 4, 5)) + return y + + f(x1) + f(x2) + f(x3) + + +def test_trace_nms(): + def make_inputs(n): + boxes = np.zeros((n, 4)) + boxes[:, :2] = np.random.rand(n, 2) * 100 + boxes[:, 2:] = np.random.rand(n, 2) * 100 + 100 + + scores = np.random.rand(n) + + return tensor(boxes), tensor(scores) + + @trace(symbolic=False) + def f(boxes, scores): + results = F.nn.nms(boxes, scores=scores, iou_thresh=0.5, max_output=20) + with exclude_from_trace(): + _ = F.nn.nms(boxes, scores=scores, iou_thresh=0.5) + return results + + f(*make_inputs(10)) + f(*make_inputs(20)) + f(*make_inputs(30)) + + +def test_trace_valid_broadcast(): + set_tensor_shape(True) + x1 = tensor(np.random.randn(1, 1)) + x2 = tensor(np.random.randn(1, 2)) + shape = (tensor([2]), tensor([2])) + + @trace(symbolic=False) + def f(x, shape): + y = F.broadcast_to(x, shape) + return y + + f(x1, shape) + f(x2, shape) diff --git a/imperative/python/tools/gen_ops.py b/imperative/python/tools/gen_ops.py index 7fcdb422..dde93e98 100755 --- a/imperative/python/tools/gen_ops.py +++ b/imperative/python/tools/gen_ops.py @@ -14,7 +14,6 @@ import os import textwrap import inspect - def camel2underscore( name, *, 
first_cap_re=re.compile('([A-Z])([A-Z][a-z]+)'), @@ -50,9 +49,9 @@ class Context: def __init__(self): self.fout = StringIO() self.indent = 0 - self.generated = [] self.skipped = [] self.generated_signature = set() + self.generated_opr = dict() def write(self, text, *fmt, indent=0): text = textwrap.dedent(text) @@ -181,6 +180,15 @@ class Context: :param outputs: the indices of output vars to be selected from raw opr result """ + + class OprItem: + def __init__(self, inputs, desc, params, version, has_out_dtype): + self.inputs = inputs + self.desc = desc + self.params = params + self.version = version + self.has_out_dtype = has_out_dtype + if body: self.skipped.append(name) return @@ -197,29 +205,56 @@ class Context: params = [('param', params)] assert params - self.write('# %s', caller_lineno()) - self.write('class %s(PodOpVisitor):', name) - self.indent += 1 + if name in self.generated_opr: + org_opr = self.generated_opr[name] + if version > org_opr.version: + def compare_doc(a, b): + if isinstance(a, str): + return a == b + else: + assert isinstance(a, Doc) + return a.doc == b.doc + + assert compare_doc(desc, org_opr.desc) + assert len(inputs) == len(org_opr.inputs) + for i, j in zip(inputs, org_opr.inputs): + assert compare_doc(i, j) + + self.generated_opr[name] = OprItem(inputs, desc, params, version, has_out_dtype) + else: + self.generated_opr[name] = OprItem(inputs, desc, params, version, has_out_dtype) + + def write_generated_oprs(self): + + for opr, opr_item in self.generated_opr.items(): + + name = opr + params = opr_item.params + version = opr_item.version + has_out_dtype = opr_item.has_out_dtype + + self.write('# %s', caller_lineno()) + self.write('class %s(PodOpVisitor):', name) + self.indent += 1 - param_names, _ = zip(*params) - self.write('param_names = (%s,)', ', '.join(map('"{}"'.format, param_names))) - self.write('name = "%s"', '{}V{}'.format(name, version) if version else name) - self.write('\n') + param_names, _ = zip(*params) + self.write('param_names = (%s,)', ', '.join(map('"{}"'.format, param_names))) + self.write('name = "%s"', '{}V{}'.format(name, version) if version else name) + self.write('\n') - self.write('def __init__(%s):', - self._gen_signature(params, - has_out_dtype=has_out_dtype)) - self.indent += 1 + self.write('def __init__(%s):', + self._gen_signature(params, + has_out_dtype=has_out_dtype)) + self.indent += 1 - self._write_gen_config(has_out_dtype=has_out_dtype) - self.write('\n') + self._write_gen_config(has_out_dtype=has_out_dtype) + self.write('\n') - self._write_make_params(params) + self._write_make_params(params) - self.write('\n') - self.indent -= 2 + self.write('\n') + self.indent -= 2 - self.generated.append(name) def decl_raw_opr(self, name, *, inputs, inputs_cvt=[], body=None, desc=None, local_defs=[], have_config=True): @@ -232,7 +267,7 @@ class Context: buf = StringIO() print( '[', - *(' "%s",' % i for i in self.generated), + *(' "%s",' % i for i in self.generated_opr), ']', sep='\n', file=buf @@ -259,6 +294,7 @@ def main(): with open(i) as fin: exec(compile(fin.read(), i, 'exec'), exec_globals) + gen.write_generated_oprs() try: git_commit = subprocess.check_output( ['git', 'rev-parse', 'HEAD'], universal_newlines=True, diff --git a/imperative/src/include/megbrain/imperative/function_hook.h b/imperative/src/impl/function_hook.h similarity index 77% rename from imperative/src/include/megbrain/imperative/function_hook.h rename to imperative/src/impl/function_hook.h index 64582f11..83cb6552 100644 --- 
a/imperative/src/include/megbrain/imperative/function_hook.h
+++ b/imperative/src/impl/function_hook.h
@@ -15,6 +15,7 @@
 namespace mgb {
 namespace imperative {
+
 template <typename TFunction>
 class FunctionHooker;

@@ -22,13 +23,18 @@ template <typename TRet, typename... TArgs>
 class FunctionHooker<TRet(TArgs...)> {
 public:
     using FunctionType = thin_function<TRet(TArgs...)>;
+    // Type of hooks. A hook should accept the real function as an argument
+    // and invoke it at an appropriate time
     using HookType = thin_function<TRet(FunctionType, TArgs...)>;
-    explicit FunctionHooker(FunctionType* fptr) : m_fptr{fptr} {}
+    explicit FunctionHooker(FunctionType* fptr) : m_fptr{fptr} {
+        m_backup = {nullptr, [](FunctionType*){}};
+    }

 public:
     FunctionHooker& apply_hook(HookType&& hook) {
         if (!m_backup) {
             FunctionType* backup = new FunctionType(*m_fptr);
+            // Restores the hooked function; invoked when the hooker is destructed
             std::function<void(FunctionType*)> restorer =
                     [fptr = m_fptr](FunctionType* bkp) -> void {
                 *fptr = *bkp;
@@ -36,9 +42,11 @@ public:
             };
             m_backup = decltype(m_backup)(backup, restorer);
         }
+        // Replace with the hooked version
         *m_fptr = [func = *m_fptr, hook](TArgs&&... args) -> TRet {
             return hook(func, std::forward<TArgs>(args)...);
         };
+        // Convenient for chaining calls
         return *this;
     }

@@ -47,9 +55,15 @@ private:
     std::unique_ptr<FunctionType, std::function<void(FunctionType*)>> m_backup;
 };

+// Helps to deduce template args
 template <typename TRet, typename... TArgs>
 FunctionHooker(thin_function<TRet(TArgs...)>* f)
         -> FunctionHooker<TRet(TArgs...)>;
-}  // namespace imperative
+template <typename TSignature>
+auto make_shared_hook(thin_function<TSignature>* fptr) {
+    return std::make_shared<FunctionHooker<TSignature>>(fptr);
+}
+
+}  // namespace imperative
 }  // namespace mgb
diff --git a/imperative/src/impl/interpreter_impl.cpp b/imperative/src/impl/interpreter_impl.cpp
index de521f22..748c3dcd 100644
--- a/imperative/src/impl/interpreter_impl.cpp
+++ b/imperative/src/impl/interpreter_impl.cpp
@@ -187,6 +187,7 @@ void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) {
 }

 void ChannelImpl::process_one_task(Command& cmd) {
+    // TODO: remove std::visit to support OSX 10.12
     std::visit([this](auto& cmd) {
         using T = std::remove_reference_t<decltype(cmd)>;
         try {
diff --git a/imperative/src/impl/opr_utility.cpp b/imperative/src/impl/opr_utility.cpp
index 4990f61e..cf0bf2a6 100644
--- a/imperative/src/impl/opr_utility.cpp
+++ b/imperative/src/impl/opr_utility.cpp
@@ -144,13 +144,24 @@ cg::OperatorNodeBase::NodeProp* OutputCallback::do_make_node_prop() const {
     prop->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP);
     SmallVector<NodeProp::DepType> dep_types(input().size(),
                                              NodeProp::DepType::DEV_COMP_ORDER);
-    dep_types[0] = NodeProp::DepType::DEV_VALUE;
+    using IT = cg::static_infer::InferType;
+    auto host_value_avail = [&]() -> bool {
+        auto inp = input(0);
+        auto it = owner_graph()->static_infer_manager().get_infer_type(inp).value;
+        return it & (IT::CONST | IT::RT_STATIC | IT::MISSING_INP);
+    };
+    m_use_host_value = m_param.prefer_host_value && host_value_avail();
+    dep_types[0] = m_use_host_value ?
diff --git a/imperative/src/impl/interpreter_impl.cpp b/imperative/src/impl/interpreter_impl.cpp
index de521f22..748c3dcd 100644
--- a/imperative/src/impl/interpreter_impl.cpp
+++ b/imperative/src/impl/interpreter_impl.cpp
@@ -187,6 +187,7 @@ void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) {
 }
 
 void ChannelImpl::process_one_task(Command& cmd) {
+    // TODO: remove std::visit to support osx 10.12
     std::visit([this](auto& cmd) {
         using T = std::remove_reference_t<decltype(cmd)>;
         try {
diff --git a/imperative/src/impl/opr_utility.cpp b/imperative/src/impl/opr_utility.cpp
index 4990f61e..cf0bf2a6 100644
--- a/imperative/src/impl/opr_utility.cpp
+++ b/imperative/src/impl/opr_utility.cpp
@@ -144,13 +144,24 @@ cg::OperatorNodeBase::NodeProp* OutputCallback::do_make_node_prop() const {
     prop->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP);
     SmallVector<NodeProp::DepType> dep_types(input().size(),
                                              NodeProp::DepType::DEV_COMP_ORDER);
-    dep_types[0] = NodeProp::DepType::DEV_VALUE;
+    using IT = cg::static_infer::InferType;
+    auto host_value_avail = [&]() -> bool {
+        auto inp = input(0);
+        auto it = owner_graph()->static_infer_manager().get_infer_type(inp).value;
+        return it & (IT::CONST | IT::RT_STATIC | IT::MISSING_INP);
+    };
+    m_use_host_value = m_param.prefer_host_value && host_value_avail();
+    dep_types[0] = m_use_host_value ? NodeProp::DepType::HOST_VALUE
                                     : NodeProp::DepType::DEV_VALUE;
     prop->reset_dep_type(input(), dep_types);
     return prop;
 }
 
 void OutputCallback::scn_do_execute() {
-    m_param.callback(input(0)->dev_tensor());
+    if (m_use_host_value) {
+        m_param.callback(owner_graph()->static_infer_manager().infer_value(input(0)));
+    } else {
+        m_param.callback(input(0)->dev_tensor());
+    }
 }
 
 cg::OperatorNodeBase* OutputCallback::shallow_copy(
diff --git a/imperative/src/impl/ops/tensor_manip.cpp b/imperative/src/impl/ops/tensor_manip.cpp
index fdf9fac6..ae16edfd 100644
--- a/imperative/src/impl/ops/tensor_manip.cpp
+++ b/imperative/src/impl/ops/tensor_manip.cpp
@@ -62,7 +62,7 @@ std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
     if (node->config().comp_node().size() ||
         node->config().output_dtype().valid() ||
         node->param().axis != opr::GetVarShape::Param::INVALID_AXIS) {
-        mgb_log_warn("weird GetVarShape");
+        mgb_log_debug("weird GetVarShape");
         return OpTrait::find_by_typeinfo(OprAttr::typeinfo())->make_from_op_node(node);
     }
     return GetVarShape::make();
diff --git a/imperative/src/impl/profiler.cpp b/imperative/src/impl/profiler.cpp
index 4987ce53..ccb98081 100644
--- a/imperative/src/impl/profiler.cpp
+++ b/imperative/src/impl/profiler.cpp
@@ -11,19 +11,20 @@
 
 #include "megbrain/imperative/profiler.h"
 
-#include
-
+#include "./function_hook.h"
 #include "megbrain/imperative/ops/opr_attr.h"
 #include "megbrain/imperative/physical_tensor.h"
+#include "megbrain/plugin/opr_footprint.h"
+
 #include "./event_pool.h"
 #include "./op_trait.h"
 
 namespace mgb {
-
 namespace imperative {
 
 namespace {
+
 CompNode::UnorderedSet collect_comp_nodes(
         const OpDef& def, const SmallVector<TensorPtr>& inputs) {
     CompNode::UnorderedSet comp_nodes;
@@ -36,37 +37,102 @@ CompNode::UnorderedSet collect_comp_nodes(
     return comp_nodes;
 }
 
+DeviceTimer::SharedEvent alloc_recorded_event(CompNode device) {
+    auto event = EventPool::with_timer().alloc_shared(device);
+    event->record();
+    return event;
+}
+
+OprFootprint footprint{};
+
 } // namespace
 
 void DeviceTimer::reset(thin_function<double()> host_timer) {
     CompNode::foreach ([this, host_timer](CompNode device) {
-        auto base_event = EventPool::with_timer().alloc_shared(device);
-        base_event->record();
-        m_base_event_table[device] = {std::move(base_event), host_timer()};
+        m_base_event_table[device] = {alloc_recorded_event(device), host_timer()};
     });
+    m_host_timer = host_timer;
 }
 
 thin_function<double()> DeviceTimer::get_device_time(CompNode device) {
     auto event = EventPool::with_timer().alloc_shared(device);
     event->record();
+    if (m_base_event_table.count(device) == 0) {
+        m_base_event_table[device] = {alloc_recorded_event(device), m_host_timer()};
+    }
     auto base = m_base_event_table[device];
     return [base, event] {
         auto [base_event, host_time] = base;
-        //TODO: sync once for each compnode
+        // TODO: sync once for each compnode
         event->host_wait();
         return base_event->elapsed_time_until(*event) * 1000 + host_time;
     };
 }
 
-void Profiler::start() {
+void DeviceTimer::clear() {
+    m_base_event_table.clear();
+}
+
+size_t TensorRecorder::record_tensor(const TensorPtr& tensor) {
+    if (m_tensor_map.count(tensor.get()) > 0) {
+        auto& [prev, id] = m_tensor_map[tensor.get()];
+        if (prev.lock() != tensor) {
+            prev = tensor;
+            id = m_next_id++;
+        }
+        return id;
+    } else {
+        auto id = m_next_id++;
+        m_tensor_map.insert(
+                {tensor.get(), {std::weak_ptr{tensor}, id}});
+        return id;
+    }
+}
+
+void TensorRecorder::clear() {
+    m_next_id = 0;
+    m_tensor_map.clear();
+}
+
+Profile& Profiler::get_profile() {
+    for (auto& entry : m_profile) {
+        for
(auto& [device, device_begin, device_end] : entry.device_list) { + MGB_MARK_USED_VAR(device); + device_begin = [value = device_begin()] { return value; }; + device_end = [value = device_end()] { return value; }; + } + } + return m_profile; +} + +void Profiler::start(uint32_t flags) { m_host_timer.reset(); - m_device_timer.reset([&]{ return m_host_timer.get_msecs();} ); - OpTrait::for_each_trait([this](OpTrait& trait) { - FunctionHooker hooker{&trait.apply_on_physical_tensor}; - hooker.apply_hook([this](auto&& apply, const OpDef& def, - const SmallVector& inputs) { + m_device_timer.reset([&] { return m_host_timer.get_msecs(); }); + OpTrait::for_each_trait([this, flags](OpTrait& trait) { + auto hook_apply_on_physical_tensor = + make_shared_hook(&trait.apply_on_physical_tensor); + auto hook_apply_on_var_node = + make_shared_hook(&trait.apply_on_var_node); + hook_apply_on_physical_tensor->apply_hook([this, flags] + (auto&& apply, const OpDef& def, const SmallVector& inputs) { + auto shape2vector = [](const TensorShape& shape) { + std::vector vector_shape; + for (size_t i = 0; i < shape.ndim; i++) { + vector_shape.push_back(shape[i]); + } + return vector_shape; + }; ProfileEntry entry; + entry.id = m_entry_count++; + // TODO: assign parent + entry.parent = 0; + // Record apply context and save to m_profile entry.op = def.copy(); + for (auto&& input : inputs) { + entry.inputs.push_back({m_tensor_recorder.record_tensor(input), + shape2vector(input->layout()), + input->comp_node()}); + } double host_begin = m_host_timer.get_msecs(); auto&& comp_nodes = collect_comp_nodes(def, inputs); for (auto&& comp_node : comp_nodes) { @@ -75,6 +141,11 @@ void Profiler::start() { m_device_timer.get_device_time(comp_node), {}}); } + if (flags & PROFILE_FOOTPRINT) { + MGB_LOCK_GUARD(m_lock); + m_entry_stack.push({&def, &entry, std::this_thread::get_id()}); + } + // Do real apply auto outputs = apply(def, inputs); for (auto& [cn, dev_begin, dev_end] : entry.device_list) { MGB_MARK_USED_VAR(cn); @@ -82,20 +153,71 @@ void Profiler::start() { dev_end = m_device_timer.get_device_time(cn); } entry.host = {host_begin, m_host_timer.get_msecs()}; - m_profile->push_back(std::move(entry)); + for (auto&& output : outputs) { + entry.outputs.push_back( + {m_tensor_recorder.record_tensor(output), + shape2vector(output->layout()), output->comp_node()}); + } + if (flags & PROFILE_FOOTPRINT) { + mgb_assert(std::get<1>(m_entry_stack.top()) == &entry); + MGB_LOCK_GUARD(m_lock); + m_entry_stack.pop(); + } + m_profile.push_back(std::move(entry)); return outputs; }); - m_hooker_list.push_back(std::move(hooker)); + if (flags & PROFILE_FOOTPRINT) { + hook_apply_on_var_node->apply_hook( + [this](auto&& apply, const OpDef& def, + VarNodeArray inputs) -> cg::OperatorNodeBase* { + auto* operator_node = apply(def, std::move(inputs)); + std::remove_reference_t + top; + { + MGB_LOCK_GUARD(m_lock); + if (m_entry_stack.empty()) { + return operator_node; + } + top = m_entry_stack.top(); + } + auto [current_op, current_entry, thread_id] = top; + if (current_op != &def || + thread_id != std::this_thread::get_id()) { + return operator_node; + } + auto&& footprint_result = + footprint.calc_footprint(operator_node); + current_entry->memory = footprint_result.memory; + current_entry->computation = + footprint_result.computation; +#if MGB_ENABLE_JSON + current_entry->param = footprint_result.param; +#endif + return operator_node; + }); + } + m_hooker_list.push_back(std::move(hook_apply_on_physical_tensor)); + 
m_hooker_list.push_back(std::move(hook_apply_on_var_node)); }); } void Profiler::stop() { m_hooker_list.clear(); - for (auto& entry : *m_profile) { + for (auto& entry : m_profile) { entry.wait_device(); } } +void Profiler::clear() { + mgb_assert(m_entry_stack.empty(), + "entry_stack should be empty after profile"); + mgb_assert(m_hooker_list.empty(), "hooks should be released"); + m_profile.clear(); + m_entry_count = 0; + m_device_timer.clear(); + m_tensor_recorder.clear(); +} + } // namespace imperative } // namespace mgb diff --git a/imperative/src/include/megbrain/imperative/opr_utility.h b/imperative/src/include/megbrain/imperative/opr_utility.h index 9054c217..14f5f272 100644 --- a/imperative/src/include/megbrain/imperative/opr_utility.h +++ b/imperative/src/include/megbrain/imperative/opr_utility.h @@ -60,7 +60,8 @@ public: using callback_t = thin_function; struct Param { callback_t callback; - bool borrow = false; + bool borrow = false; // do not obtain shared ownership on DeviceTensorND + bool prefer_host_value = false; // use host value when possible }; OutputCallback(Param param, const VarNodeArray& inputs, @@ -81,6 +82,7 @@ protected: NodeProp* do_make_node_prop() const override; private: Param m_param; + mutable bool m_use_host_value; }; MGB_DEFINE_OPR_CLASS(NopCallback, cg::OperatorNodeBase) // { diff --git a/imperative/src/include/megbrain/imperative/profiler.h b/imperative/src/include/megbrain/imperative/profiler.h index bece8226..b2452242 100644 --- a/imperative/src/include/megbrain/imperative/profiler.h +++ b/imperative/src/include/megbrain/imperative/profiler.h @@ -11,7 +11,10 @@ #pragma once -#include +#include +#include +#include +#include #include "megbrain/comp_node.h" #include "megbrain/graph/event.h" @@ -19,27 +22,39 @@ #include "megbrain/utils/timer.h" #include "megbrain/imperative/op_def.h" - -#include "megbrain/imperative/function_hook.h" +#include "megbrain/imperative/physical_tensor.h" namespace mgb { namespace imperative { -struct ProfileEntry{ +using ProfileTensor = std::tuple, CompNode>; + +struct ProfileEntry { using TimeClosure = std::function; + size_t id; + size_t parent; std::shared_ptr op; + //(host_begin, host_end) std::tuple host; + //[(device, device_begin, device_end)] std::vector> device_list; - void wait_device(){ - for(auto& [cn, begin, end]: device_list){ + std::vector inputs; + std::vector outputs; + long long memory = 0; + long long computation = 0; +#if MGB_ENABLE_JSON + std::shared_ptr param; +#endif + void wait_device() { + for (auto& [cn, begin, end] : device_list) { MGB_MARK_USED_VAR(cn); - begin = [begin=begin()]{ return begin; }; - end = [end = end()]{ return end; }; + begin = [begin = begin()] { return begin; }; + end = [end = end()] { return end; }; } } }; -using Profile = std::vector; +using Profile = std::list; class DeviceTimer { public: @@ -47,31 +62,54 @@ public: DeviceTimer() = default; void reset(thin_function host_timer); thin_function get_device_time(CompNode device); + void clear(); private: CompNode::UnorderedMap> m_base_event_table; + thin_function m_host_timer; +}; + +class TensorRecorder { +private: + // active tensors + std::unordered_map, size_t>> + m_tensor_map; + size_t m_next_id; + +public: + size_t record_tensor(const TensorPtr& tensor); + void clear(); }; class Profiler { public: - Profiler(Profile* profile = nullptr) { - if (!profile) { - m_owned_profile = std::make_unique(); - profile = m_owned_profile.get(); - } - m_profile = profile; - } - void start(); + enum Flags { + PROFILE_FOOTPRINT = 1, + }; + +public: 
+    Profiler() = default;
+    // Start the profiler by hooking the OpTrait entry points
+    void start(uint32_t flags);
+    // Stop the profiler and clean up the hooked environment
     void stop();
-    Profile& get_profile() { return *m_profile; }
+    void clear();
+    Profile& get_profile();
 
 private:
     DeviceTimer m_device_timer;
     RealTimer m_host_timer;
-    Profile* m_profile;
+    Profile m_profile;
+    TensorRecorder m_tensor_recorder;
+    std::stack<std::tuple<const OpDef*, ProfileEntry*, std::thread::id>>
+            m_entry_stack;
+    // Holds the profile owned by this Profiler
     std::unique_ptr<Profile> m_owned_profile;
-    std::vector>
-            m_hooker_list;
+    // Holds the hooks; cleared when the profiler stops
+    std::vector m_hooker_list;
+    size_t m_entry_count = 0;
+    Spinlock m_lock;
+    std::unordered_map> m_recorded_tensors;
 };
 
 } // namespace imperative
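The reworked `Profiler` owns its `Profile` and exposes an explicit lifecycle instead of taking an external `Profile*`. A minimal sketch of the intended call sequence (the loop body is hypothetical; ops must go through the hooked `OpTrait` entry points between `start()` and `stop()` to be recorded):

```cpp
mgb::imperative::Profiler profiler;
profiler.start(mgb::imperative::Profiler::PROFILE_FOOTPRINT);
// ... apply imperative ops here; each apply is recorded as a ProfileEntry ...
profiler.stop();  // removes the hooks and waits on the recorded device events
for (auto& entry : profiler.get_profile()) {
    // host times are plain doubles; the device-time closures have already
    // been collapsed to constants by get_profile()
    auto [host_begin, host_end] = entry.host;
}
profiler.clear();  // reset counters and timer tables before the next session
```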
diff --git a/imperative/src/test/helper.cpp b/imperative/src/test/helper.cpp
index 4f38813c..2c369ae6 100644
--- a/imperative/src/test/helper.cpp
+++ b/imperative/src/test/helper.cpp
@@ -107,6 +107,7 @@ void OprChecker::run(std::vector inp_keys) {
     auto graph = ComputingGraph::make();
     graph->options().graph_opt_level = 0;
     for (size_t i = 0; i < nr_inps; ++ i) {
+        // TODO: remove std::visit to support osx 10.12
         host_inp[i] = std::visit([&gen](auto&& arg) -> HostTensorND {
             using T = std::decay_t<decltype(arg)>;
             if constexpr (std::is_same_v) {
diff --git a/imperative/src/test/opr_utility.cpp b/imperative/src/test/opr_utility.cpp
index c808d2cd..a078d419 100644
--- a/imperative/src/test/opr_utility.cpp
+++ b/imperative/src/test/opr_utility.cpp
@@ -13,6 +13,7 @@
 #include "megbrain/opr/io.h"
 #include "megbrain/opr/basic_arith.h"
 #include "megbrain/opr/utility.h"
+#include "megbrain/opr/tensor_manip.h"
 #include "megbrain/test/helper.h"
 
 using namespace mgb;
@@ -50,6 +51,27 @@ TEST(TestOprUtility, OutputCallback) {
     MGB_ASSERT_TENSOR_EQ(hy, *hx);
 }
 
+TEST(TestOprUtility, OutputCallbackPreferHost) {
+    HostTensorGenerator<> gen;
+    auto hx = gen({2, 3});
+    auto graph = ComputingGraph::make();
+    auto x = opr::Host2DeviceCopy::make(*graph, hx);
+    x = opr::GetVarShape::make(x);
+    HostTensorND hy;
+    auto callback = [&hy](DeviceTensorND dv) { hy.copy_from(dv); };
+    opr::OutputCallback::Param param{callback};
+    param.prefer_host_value = true;
+    auto dummy = opr::OutputCallback::make(param, x);
+    auto y = opr::VirtualDep::make({x, dummy});
+
+    ComputingGraph::OutputSpec outspec{{y, [](DeviceTensorND&){}}};
+    auto func = graph->compile(outspec);
+    func->execute();
+    ASSERT_TRUE(hy.comp_node() == CompNode::default_cpu());
+    ASSERT_EQ(hy.ptr()[0], 2);
+    ASSERT_EQ(hy.ptr()[1], 3);
+}
+
 TEST(TestOprUtility, NopCallback) {
     HostTensorGenerator<> gen;
     auto hx = gen({2, 3});
diff --git a/scripts/cmake-build/BUILD_README.md b/scripts/cmake-build/BUILD_README.md
index fa060bc3..7cc2c73f 100644
--- a/scripts/cmake-build/BUILD_README.md
+++ b/scripts/cmake-build/BUILD_README.md
@@ -32,10 +32,6 @@
     if u do not do 4d/4e/4f, CUDA runtime can not find dll
     5: install python3 (DFT 3.8.3) to /c/Users/${USER}/mge_whl_python_env/3.8.3 and put it to PATH env
        and run python3 -m pip install numpy (if u want to build with training mode or build python whl)
-    6: install swig from install gui (if u want to build with training mode or build python whl)
-        a: download swig: https://nchc.dl.sourceforge.net/project/swig/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip
-        b: install swig to /c/Users/${USER}/swigwin-4.0.2
-        c: apply scripts/whl/windows/fix-ptr-define-issue.patch to c/Users/${USER}/swigwin-4.0.2
 ```
 ### linux host build
 ```
diff --git a/scripts/whl/BUILD_PYTHON_WHL_README.md b/scripts/whl/BUILD_PYTHON_WHL_README.md
index c25d3f7b..335d6ab6 100644
--- a/scripts/whl/BUILD_PYTHON_WHL_README.md
+++ b/scripts/whl/BUILD_PYTHON_WHL_README.md
@@ -38,19 +38,19 @@
     d: mv /c/Users/${USER}/mge_whl_python_env/3.8.3/python.exe /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe
 4: install needed package for build python whl package
     a0: /c/Users/${USER}/mge_whl_python_env/3.5.4/python3.exe -m pip install --upgrade pip
-    a1: /c/Users/${USER}/mge_whl_python_env/3.5.4/python3.exe -m pip install -r python_module/requires-test.txt
+    a1: /c/Users/${USER}/mge_whl_python_env/3.5.4/python3.exe -m pip install -r imperative/python/requires-test.txt
     a2: /c/Users/${USER}/mge_whl_python_env/3.5.4/python3.exe -m pip install numpy wheel requests tqdm tabulate
     b0: /c/Users/${USER}/mge_whl_python_env/3.6.8/python3.exe -m pip install --upgrade pip
-    b1: /c/Users/${USER}/mge_whl_python_env/3.6.8/python3.exe -m pip install -r python_module/requires-test.txt
+    b1: /c/Users/${USER}/mge_whl_python_env/3.6.8/python3.exe -m pip install -r imperative/python/requires-test.txt
     b2: /c/Users/${USER}/mge_whl_python_env/3.6.8/python3.exe -m pip install numpy wheel requests tqdm tabulate
     c0: /c/Users/${USER}/mge_whl_python_env/3.7.7/python3.exe -m pip install --upgrade pip
-    c1: /c/Users/${USER}/mge_whl_python_env/3.7.7/python3.exe -m pip install -r python_module/requires-test.txt
+    c1: /c/Users/${USER}/mge_whl_python_env/3.7.7/python3.exe -m pip install -r imperative/python/requires-test.txt
     c2: /c/Users/${USER}/mge_whl_python_env/3.7.7/python3.exe -m pip install numpy wheel requests tqdm tabulate
     d0: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install --upgrade pip
-    d1: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install -r python_module/requires-test.txt
+    d1: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install -r imperative/python/requires-test.txt
     d2: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install numpy wheel requests tqdm tabulate
 ```
diff --git a/scripts/whl/macos/macos_build_whl.sh b/scripts/whl/macos/macos_build_whl.sh
index bbf4a1b1..08b197f7 100755
--- a/scripts/whl/macos/macos_build_whl.sh
+++ b/scripts/whl/macos/macos_build_whl.sh
@@ -112,6 +112,10 @@ function do_build() {
         export EXTRA_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${PYTHON_DIR} -DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} "
         #config build type to RelWithDebInfo to enable MGB_ENABLE_DEBUG_UTIL etc
         export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo "
+        # we use std::visit in src, so the minimum macOS version is 10.14; but 10.14
+        # has an objdump issue, so we configure 10.15 while naming the whl 10.14
+        # TODO: can be lowered to 10.12 after the use of std::visit is removed
+        export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15 "
 
         #call build and install
         #FIXME: cmake do not triger update python config, after
@@ -164,9 +168,7 @@ function do_build() {
         cd ${BUILD_DIR}/staging/dist/
         org_whl_name=`ls Meg*.whl`
         index=`awk -v a="${org_whl_name}" -v b="-macosx" 'BEGIN{print index(a,b)}'`
-        #compat for osx version from 10.5(Leopard)
-        #FIXME: same no need at -macosx-version-min=10.5 for build so
-        compat_whl_name=`echo ${org_whl_name} |cut -b -$index`macosx_10_5_x86_64.whl
+        compat_whl_name=`echo ${org_whl_name} |cut -b -$index`macosx_10_14_x86_64.whl
         echo "org whl name: ${org_whl_name}"
         echo "comapt whl name: ${compat_whl_name}"
         cp ${BUILD_DIR}/staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name}
diff --git a/sdk/load-and-run/dump_with_testcase_imperative.py
b/sdk/load-and-run/dump_with_testcase_imperative.py
index 8ebd1274..0b3cce77 100755
--- a/sdk/load-and-run/dump_with_testcase_imperative.py
+++ b/sdk/load-and-run/dump_with_testcase_imperative.py
@@ -20,6 +20,7 @@ import megengine.core.tensor.megbrain_graph as G
 from megengine import cgtools
 from megengine.core.ops import builtin
 from megengine.core.tensor.core import apply
+from megengine.core.tensor.megbrain_graph import VarNode
 from megengine.core.tensor.raw_tensor import as_raw_tensor
 
 logger = mge.get_logger(__name__)
@@ -484,11 +485,29 @@ def main():
         sereg_kwargs = dict(keep_var_name=0, keep_param_name=False)
     else:
         sereg_kwargs = dict(keep_var_name=2, keep_param_name=True)
+
+
+    strip_info_file = args.output + '.json' if args.output_strip_info else None
 
     with open(args.output, "wb") as fout:
         fout.write(b"mgbtest0")
         fout.write(struct.pack("I", len(feeds["testcases"])))
-    fout.write(rt.dump_graph(output_mgbvars))
+    if isinstance(output_mgbvars, dict):
+        wrap_output_vars = dict([(i, VarNode(j)) for i, j in output_mgbvars.items()])
+    else:
+        wrap_output_vars = [VarNode(i) for i in output_mgbvars]
+    dump_content, stat = G.dump_graph(
+        wrap_output_vars,
+        append_json=True,
+        strip_info_file=strip_info_file,
+        **sereg_kwargs)
+    fout.write(dump_content)
+
+    logger.info(
+        'graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB'.format(
+            stat.tot_bytes / 1024, (stat.tot_bytes - stat.tensor_value_bytes) / 1024
+        )
+    )
 
     def make_dev_tensor(value, dtype=None, device=None):
         return as_raw_tensor(value, dtype=dtype, device=device)._dev_tensor()
@@ -507,7 +526,11 @@ def main():
             testcase.keys()
         )
         with open(args.output, "ab") as fout:
-            fout.write(G.dump_graph(*output_mgbvars))
+            dump_content, _ = G.dump_graph(
+                output_mgbvars,
+                strip_info_file=strip_info_file,
+                append_json=True)
+            fout.write(dump_content)
diff --git a/sdk/load-and-run/dump_with_testcase_mge.py b/sdk/load-and-run/dump_with_testcase_mge.py
index 57b933b4..b9026f61 100755
--- a/sdk/load-and-run/dump_with_testcase_mge.py
+++ b/sdk/load-and-run/dump_with_testcase_mge.py
@@ -475,7 +475,8 @@ def main():
         args.output,
         output_mgbvars,
         append=True,
-        output_strip_info=args.output_strip_info)
+        output_strip_info=args.output_strip_info,
+        append_json=True)
 
 if __name__ == '__main__':
     main()
diff --git a/sdk/load-and-run/src/mgblar.cpp b/sdk/load-and-run/src/mgblar.cpp
index 477c9190..96d83601 100644
--- a/sdk/load-and-run/src/mgblar.cpp
+++ b/sdk/load-and-run/src/mgblar.cpp
@@ -194,6 +194,26 @@
 R"__usage__(
     Execute operators with kernels implemented in MegDNN with CHWN4 tensor format. Can only be
     used on Nvidia GPUs, whose compute capability is above 6.1.
 )__usage__"
+R"__usage__(
+  --enable-nchw44
+    Execute operators with kernels implemented in MegDNN with NCHW44 tensor format. This can
+    only be used on armv7 and arm64, and supports the float32, qint8 and int8x8x16 data types.
+)__usage__"
+R"__usage__(
+  --enable-nchw88
+    Execute operators with kernels implemented in MegDNN with NCHW88 tensor format. This can
+    only be used on x86 with the float data type.
+)__usage__"
+R"__usage__(
+  --enable-nchw44-dot
+    Execute operators with kernels implemented in MegDNN with NCHW44-DOT tensor format. This can
+    only be used on arm32 and arm64 with dot-product support, and only supports qint8 models.
+)__usage__"
+R"__usage__(
+  --weight-preprocess
+    Execute operators with weight preprocessing, which can reduce operator execution time with
+    algorithms such as winograd and im2col, but may consume more memory.
+)__usage__" ; @@ -1226,6 +1246,11 @@ Args Args::from_argv(int argc, char **argv) { graph_opt.graph_opt.weight_winograd_transform = true; continue; } + if (!strcmp(argv[i], "--weight-preprocess")) { + mgb_log_warn("enable weight-preprocess optimization"); + graph_opt.graph_opt.enable_weight_preprocess(); + continue; + } fprintf(stderr, "invalid arg: %s\n", argv[i]); ret.args_parse_ret = -1; diff --git a/sdk/xor-deploy/xornet.py b/sdk/xor-deploy/xornet.py index a032ef56..04835f73 100644 --- a/sdk/xor-deploy/xornet.py +++ b/sdk/xor-deploy/xornet.py @@ -1,6 +1,7 @@ import numpy as np import megengine as mge +import megengine.autodiff as ad import megengine.functional as F import megengine.module as M import megengine.optimizer as optim @@ -35,57 +36,54 @@ class XORNet(M.Module): return x -@trace(symbolic=True) -def train_fun(data, label, net=None, opt=None): - net.train() - pred = net(data) - loss = F.cross_entropy_with_softmax(pred, label) - opt.backward(loss) - return pred, loss - - -@trace(symbolic=True) -def val_fun(data, label, net=None): - net.eval() - pred = net(data) - loss = F.cross_entropy_with_softmax(pred, label) - return pred, loss - - -@trace(symbolic=True) -def pred_fun(data, net=None): - net.eval() - pred = net(data) - pred_normalized = F.softmax(pred) - return pred_normalized - - def main(): if not mge.is_cuda_available(): mge.set_default_device("cpux") net = XORNet() + gm = ad.GradManager().attach(net.parameters()) opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.9) batch_size = 64 train_dataset = minibatch_generator(batch_size) val_dataset = minibatch_generator(batch_size) - data = mge.tensor() - label = mge.tensor(np.zeros((batch_size,)), dtype=np.int32) + def train_fun(data, label): + opt.clear_grad() + with gm: + pred = net(data) + loss = F.cross_entropy_with_softmax(pred, label) + gm.backward(loss) + opt.step() + return pred, loss + + def val_fun(data, label): + pred = net(data) + loss = F.cross_entropy_with_softmax(pred, label) + return pred, loss + + @trace(symbolic=True, capture_as_const=True) + def pred_fun(data): + pred = net(data) + pred_normalized = F.softmax(pred) + return pred_normalized + + data = np.random.random((batch_size, 2)).astype(np.float32) + label = np.zeros((batch_size,)).astype(np.int32) train_loss = [] val_loss = [] for step, minibatch in enumerate(train_dataset): if step > 1000: break - data.set_value(minibatch["data"]) - label.set_value(minibatch["label"]) - opt.zero_grad() - _, loss = train_fun(data, label, net=net, opt=opt) + data = minibatch["data"] + label = minibatch["label"] + net.train() + _, loss = train_fun(data, label) train_loss.append((step, loss.numpy())) if step % 50 == 0: minibatch = next(val_dataset) - _, loss = val_fun(data, label, net=net) + net.eval() + _, loss = val_fun(data, label) loss = loss.numpy()[0] val_loss.append((step, loss)) print("Step: {} loss={}".format(step, loss)) @@ -108,8 +106,10 @@ def main(): ] ) - data.set_value(test_data) - out = pred_fun(data, net=net) + # tracing only accepts tensor as input + data = mge.tensor(test_data, dtype=np.float32) + net.eval() + out = pred_fun(data) pred_output = out.numpy() pred_label = np.argmax(pred_output, 1) @@ -125,11 +125,8 @@ def main(): model_name = "xornet_deploy.mge" - if pred_fun.enabled: - print("Dump model as {}".format(model_name)) - pred_fun.dump(model_name, arg_names=["data"]) - else: - print("pred_fun must be run with trace enabled in order to dump model") + print("Dump model as {}".format(model_name)) + pred_fun.dump(model_name, arg_names=["data"]) if 
__name__ == "__main__":
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d380d389..12d6e957 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -181,7 +181,7 @@ if(ANDROID)
     target_link_libraries(megbrain PUBLIC log)
 endif()
 
-set (_VER_FILE ${PROJECT_SOURCE_DIR}/imperative/src/version.ld)
+set (_VER_FILE ${PROJECT_SOURCE_DIR}/src/version.ld)
 
 if(MGE_BUILD_IMPERATIVE_RT
 )
@@ -189,7 +189,7 @@ if(MGE_BUILD_IMPERATIVE_RT
     add_library(megengine_export SHARED)
     target_link_libraries(megengine_export PUBLIC megbrain megdnn)
     if (MGE_WITH_DISTRIBUTED)
-        message("megengine_export configured to link megray")
+        message("-- megengine_export configured to link megray")
         target_link_libraries(megengine_export PUBLIC megray)
     endif()
 endif()
diff --git a/src/core/impl/comp_node/atlas/comp_node.cpp b/src/core/impl/comp_node/atlas/comp_node.cpp
index 04376299..4b7a03d5 100644
--- a/src/core/impl/comp_node/atlas/comp_node.cpp
+++ b/src/core/impl/comp_node/atlas/comp_node.cpp
@@ -104,9 +104,14 @@ public:
     void copy_to_host(void* host_ptr, const void* device_ptr,
                       size_t size) override {
         activate();
+#if MGB_USE_ATLAS_ASYNC_API
         MGB_ATLAS_CHECK(aclrtMemcpyAsync(host_ptr, size, device_ptr, size,
                                          ACL_MEMCPY_DEVICE_TO_HOST,
                                          m_env.atlas_env().stream));
+#else
+        MGB_ATLAS_CHECK(aclrtMemcpy(host_ptr, size, device_ptr, size,
+                                    ACL_MEMCPY_DEVICE_TO_HOST));
+#endif
     }
 
     void copy_to_device(void* device_ptr, const void* host_ptr,
@@ -225,9 +230,14 @@ void AtlasCompNodeImpl::peer_copy_to(Impl* dest_impl, void* dest,
     auto&& src_env = m_env.atlas_env();
     activate();
     if (dst_env.device == src_env.device) {
-        MGB_ATLAS_CHECK(aclrtMemcpyAsync(dest, size, src, size,
-                                         ACL_MEMCPY_DEVICE_TO_DEVICE,
-                                         dst_env.stream));
+#if MGB_USE_ATLAS_ASYNC_API
+        MGB_ATLAS_CHECK(aclrtMemcpyAsync(dest, size, src, size,
+                                         ACL_MEMCPY_DEVICE_TO_DEVICE,
+                                         dst_env.stream));
+#else
+        MGB_ATLAS_CHECK(aclrtMemcpy(dest, size, src, size,
+                                    ACL_MEMCPY_DEVICE_TO_DEVICE));
+#endif
     } else {
         mgb_throw(MegBrainError,
                   "Atlas does not support peer copy between differents "
@@ -239,12 +249,18 @@ void AtlasCompNodeImpl::peer_copy_to(Impl* dest_impl, void* dest,
     mgb_assert(dest_impl->env().property().type == DeviceType::CPU,
                "cuda peer_copy_to only implemented for CPU");
     auto copy = [this, dest, src, size]() {
-        auto stream = m_env.atlas_env().stream;
         m_env.atlas_env().activate();
+
+#if MGB_USE_ATLAS_ASYNC_API
+        auto stream = m_env.atlas_env().stream;
         MGB_ATLAS_CHECK(aclrtMemcpyAsync(dest, size, src, size,
                                          ACL_MEMCPY_DEVICE_TO_HOST,
                                          m_env.atlas_env().stream));
         MGB_ATLAS_CHECK(aclrtSynchronizeStream(stream));
+#else
+        MGB_ATLAS_CHECK(
+                aclrtMemcpy(dest, size, src, size, ACL_MEMCPY_DEVICE_TO_HOST));
+#endif
     };
     dest_impl->env().cpu_env().dispatch(copy);
diff --git a/src/core/impl/comp_node/cpu/comp_node.cpp b/src/core/impl/comp_node/cpu/comp_node.cpp
index 9c22aaa5..682b51d7 100644
--- a/src/core/impl/comp_node/cpu/comp_node.cpp
+++ b/src/core/impl/comp_node/cpu/comp_node.cpp
@@ -102,17 +102,21 @@ class CpuCompNode::SeqRecorderImpl final : public CompNodeSeqRecorder {
     bool m_fake_exec = false, m_synchronized = false, m_stopped = false,
          m_first_replay = true;
     SeqRecorderImpl** const m_self_pointer;
-    std::mutex* const m_self_pointer_mtx;
     std::vector<TaskElem> m_tasks;
     ThreadPool* m_thread_pool = nullptr;
-
+    const CompNode m_record_compnode;
+    /*!
+     * \brief used to check that every recorded task belongs to this
+     * recorder's own CompNode, to avoid hooking tasks of other CompNodes
+     * into the recorder.
+ */ + void check_the_same_comp_node(const CompNode& comp_node) const; public: - SeqRecorderImpl(SeqRecorderImpl** self_pointer, - std::mutex* const self_pointer_mtx, ThreadPool* thread_pool) + SeqRecorderImpl(SeqRecorderImpl** self_pointer, ThreadPool* thread_pool, + const CompNode& comp_node) : m_self_pointer{self_pointer}, - m_self_pointer_mtx{self_pointer_mtx}, - m_thread_pool{thread_pool} { + m_thread_pool{thread_pool}, + m_record_compnode{comp_node} { mgb_assert(!*m_self_pointer); *m_self_pointer = this; } @@ -123,23 +127,25 @@ public: } } - void enter_fake_exec() override { + void enter_fake_exec(const CompNode& comp_node) override { + check_the_same_comp_node(comp_node); mgb_assert(!m_stopped && !m_fake_exec); m_fake_exec = true; } - void exit_fake_exec() override { + void exit_fake_exec(const CompNode& comp_node) override { + check_the_same_comp_node(comp_node); mgb_assert(!m_stopped && m_fake_exec); mgb_assert(m_tasks.empty()); m_fake_exec = false; m_synchronized = false; } - void stop() override { + void stop(const CompNode& comp_node = {}) override { + check_the_same_comp_node(comp_node); mgb_assert(*m_self_pointer == this); mgb_assert(!m_fake_exec); *m_self_pointer = nullptr; - m_self_pointer_mtx->unlock(); m_stopped = true; } @@ -175,25 +181,32 @@ public: }); } - void on_alloc() { + void on_alloc(const CompNode& comp_node) { + check_the_same_comp_node(comp_node); mgb_assert(m_fake_exec, "alloc is disallowed during comp node seq recording"); } - void on_free() { + void on_free(const CompNode& comp_node) { + check_the_same_comp_node(comp_node); mgb_assert(m_fake_exec, "free is disallowed during comp node seq recording"); } - void on_sync() { m_synchronized = true; } + void on_sync(const CompNode& comp_node) { + check_the_same_comp_node(comp_node); + m_synchronized = true; + } - void dispatch(Task&& task) { + void dispatch(Task&& task, const CompNode& comp_node) { mgb_assert(!m_synchronized, "no more tasks should be dispatched after synchronization"); auto kern = [task](size_t, size_t) { task(); }; - dispatch_allow_after_sync({std::move(kern), static_cast(1_z)}); + dispatch_allow_after_sync({std::move(kern), static_cast(1_z)}, + comp_node); } - void dispatch_allow_after_sync(Task&& task) { + void dispatch_allow_after_sync(Task&& task, const CompNode& comp_node) { + check_the_same_comp_node(comp_node); mgb_assert(!m_stopped, "dispatch should not be called after recording is stopped"); if (!m_fake_exec) { @@ -201,159 +214,45 @@ public: m_tasks.push_back({std::move(kern), static_cast(1_z)}); } } - void dispatch(TaskElem&& task_elem) { + void dispatch(TaskElem&& task_elem, const CompNode& comp_node) { mgb_assert(!m_synchronized, "no more tasks should be dispatched after synchronization"); - dispatch_allow_after_sync(std::move(task_elem)); + dispatch_allow_after_sync(std::move(task_elem), comp_node); } - void dispatch_allow_after_sync(TaskElem&& task_elem) { + void dispatch_allow_after_sync(TaskElem&& task_elem, + const CompNode& comp_node) { + check_the_same_comp_node(comp_node); mgb_assert(!m_stopped, "dispatch should not be called after recording is stopped"); if (!m_fake_exec) { m_tasks.push_back(task_elem); } } - size_t nr_threads() { + size_t nr_threads(const CompNode& comp_node) { + check_the_same_comp_node(comp_node); return m_thread_pool ? m_thread_pool->nr_threads() : 1_z; } ThreadPool* get_thread_pool() { return m_thread_pool; } }; -//! 
implementation of CPUDispatcher that is passed to megdnn via megcore -class CpuCompNode::WorkerQueue::DispatcherImpl final: public CPUDispatcher { - std::atomic_size_t m_nr_task{0}; - std::shared_ptr m_queue; - SeqRecorderImpl** const m_cur_recorder; - - public: - DispatcherImpl(const std::shared_ptr& queue, - SeqRecorderImpl** recorder) - : m_queue{queue}, m_cur_recorder{recorder} {} - - void dispatch(Task&& task) override { - if (*m_cur_recorder) { - (*m_cur_recorder)->dispatch(std::move(task)); - } else { - m_nr_task.fetch_add(1, std::memory_order_relaxed); - auto kern = [task](size_t, size_t) { task(); }; - m_queue->add_task({kern, static_cast(1_z)}); - } - } - - void dispatch(MultiThreadingTask&& task, size_t parallelism) override { - if (*m_cur_recorder) { - (*m_cur_recorder)->dispatch({std::move(task), parallelism}); - } else { - m_nr_task.fetch_add(1, std::memory_order_relaxed); - m_queue->add_task({std::move(task), parallelism}); - } - } - - void sync() override { - if (*m_cur_recorder) { - (*m_cur_recorder)->on_sync(); - } else { - m_queue->wait_all_task_finish(); - } - } - - size_t nr_threads() override { - if (*m_cur_recorder) { - return (*m_cur_recorder)->nr_threads(); - } else { - return m_queue->nr_threads(); - } - } - - size_t get_nr_dispatched_tasks() const override { - return m_nr_task; - } - - void set_affinity(AffinityCallBack&& affinity_cb) override { - auto thread_pool = m_queue->get_thread_pool(); - if(thread_pool){ - thread_pool->set_affinity(affinity_cb); - } else { - auto affinity_run = [affinity_cb](size_t, size_t) { - affinity_cb(0); - }; - m_queue->add_task({affinity_run, 1_z}); - } - } -}; - -//! implementation of InplaceCPUDispatcher -class InplaceCPUDispatcher final : public CPUDispatcher { - std::atomic_size_t m_nr_task{0}; - ThreadPool* m_thread_pool = nullptr; - CpuCompNode::SeqRecorderImpl** const m_cur_recorder; - -public: - InplaceCPUDispatcher(CpuCompNode::SeqRecorderImpl** recorder, - ThreadPool* thread_pool = nullptr) - : m_thread_pool(thread_pool), m_cur_recorder(recorder) {} - - void dispatch(Task&& task) override { - if (*m_cur_recorder) { - (*m_cur_recorder)->dispatch(std::move(task)); - } else if (m_thread_pool) { - m_nr_task.fetch_add(1, std::memory_order_relaxed); - auto kern = [task](size_t, size_t) { task(); }; - m_thread_pool->add_task({kern, static_cast(1_z)}); - }else { - m_nr_task.fetch_add(1, std::memory_order_relaxed); - task(); - } - } - - void dispatch(MultiThreadingTask&& task, size_t parallelism) override { - if (*m_cur_recorder) { - (*m_cur_recorder)->dispatch({std::move(task), parallelism}); - } else if (m_thread_pool) { - m_nr_task.fetch_add(1, std::memory_order_relaxed); - m_thread_pool->add_task({task, parallelism}); - }else{ - m_nr_task.fetch_add(1, std::memory_order_relaxed); - for(size_t i=0; inr_threads() : 1_z; - } - - void sync() override { - if (*m_cur_recorder) { - (*m_cur_recorder)->on_sync(); - } else if (m_thread_pool) { - m_thread_pool->deactive(); - } - } - - size_t get_nr_dispatched_tasks() const override { return m_nr_task; } - - void set_affinity(AffinityCallBack&& affinity_cb) override { - if (*m_cur_recorder) { - (*m_cur_recorder)->get_thread_pool()->set_affinity(affinity_cb); - } else if (m_thread_pool) { - m_thread_pool->set_affinity(affinity_cb); - }else{ - affinity_cb(0); - } - } -}; - class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase { MGB_DYN_TYPE_OBJ_FINAL_DECL; //! used during comp node seq rec class CompSeqRecEventImpl; + class CpuEventImpl; + +//! 
TODO: thread_local is not supported on iOS because of an Xcode bug, see
+//! https://github.com/tensorflow/tensorflow/issues/18356
+//! this fallback should be deleted once Xcode is updated
+#ifndef IOS
+    static thread_local SeqRecorderImpl* sm_cur_recorder;
+#else
+    SeqRecorderImpl* sm_cur_recorder = nullptr;
+#endif
 
-    SeqRecorderImpl* m_cur_recorder = nullptr;
-    std::mutex m_cur_recorder_mtx;
     std::shared_ptr<WorkerQueue> m_worker_queue;
     Locator m_locator, m_locator_logical;
     std::unique_ptr<ThreadPool> m_thread_pool;
@@ -374,49 +273,10 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase {
 
     public:
         CompNodeImpl(const Locator& locator, const Locator& locator_logical,
-                     const std::shared_ptr<WorkerQueue>& worker_queue)
-                : CpuDispatchableBase(static_free_device, static_free_host),
-                  m_worker_queue{worker_queue},
-                  m_locator(locator),
-                  m_locator_logical(locator_logical) {
-            auto cn = make_comp_node_from_impl(this);
-            if (locator.type == DeviceType::MULTITHREAD) {
-                m_thread_pool = std::unique_ptr<ThreadPool>(new ThreadPool(
-                        static_cast<size_t>(locator.nr_threads)));
-                mgb_assert(m_thread_pool, "ThradPool create failed");
-            }
-
-            if (locator.type == DeviceType::CPU) {
-                if(locator.device == Locator::DEVICE_CPU_DEFAULT){
-                    sm_default_cpu_comp_node_ptr = this;
-                    m_env.init_cpu({std::make_shared<InplaceCPUDispatcher>(
-                                           &m_cur_recorder)},
-                                   cn);
-                } else {
-                    m_env.init_cpu(
-                            {std::make_shared<WorkerQueue::DispatcherImpl>(
-                                    m_worker_queue, &m_cur_recorder)},
-                            cn);
-                }
-            } else if (locator.type == DeviceType::MULTITHREAD) {
-                if (locator.device == Locator::DEVICE_MULTITHREAD_DEFAULT) {
-                    m_env.init_cpu(
-                            {std::make_shared<InplaceCPUDispatcher>(
-                                    &m_cur_recorder, m_thread_pool.get())},
-                            cn);
-                } else {
-                    m_worker_queue->attach_thread_pool(m_thread_pool.get());
-                    m_env.init_cpu(
-                            {std::make_shared<WorkerQueue::DispatcherImpl>(
-                                    m_worker_queue, &m_cur_recorder)},
-                            cn);
-                }
-            }
-        }
-
+                     const std::shared_ptr<WorkerQueue>& worker_queue);
         ~CompNodeImpl() {
-            if (m_cur_recorder) {
-                m_cur_recorder->stop();
+            if (sm_cur_recorder) {
+                sm_cur_recorder->stop();
             }
             if (m_worker_queue) {
                 // synchronize before fini
@@ -461,17 +321,17 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase {
         }
 
         void* alloc_device(size_t size) override {
-            if (m_cur_recorder) {
-                m_cur_recorder->on_alloc();
+            if (sm_cur_recorder) {
+                sm_cur_recorder->on_alloc(this);
             }
             return mgb_aligned_alloc(size);
         }
 
         void free_device(void *ptr) {
-            if (m_cur_recorder || check_global_finalized("free_device()")) {
+            if (sm_cur_recorder || check_global_finalized("free_device()")) {
                 mgb_aligned_free(ptr);
-                if (m_cur_recorder) {
-                    m_cur_recorder->on_free();
+                if (sm_cur_recorder) {
+                    sm_cur_recorder->on_free(this);
                 }
                 return;
             } else {
@@ -556,8 +416,8 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase {
         std::unique_ptr<Event> create_event(size_t flags) override;
 
         void sync() override {
-            if (m_cur_recorder) {
-                m_cur_recorder->on_sync();
+            if (sm_cur_recorder) {
+                sm_cur_recorder->on_sync(this);
             } else if (m_worker_queue) {
                 m_worker_queue->wait_all_task_finish();
             }
@@ -589,13 +449,16 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase {
 
         std::unique_ptr<CompNodeSeqRecorder> create_seq_recorder(
                 cg::ComputingGraph*) override {
-            m_cur_recorder_mtx.lock();
-            return std::make_unique<SeqRecorderImpl>(
-                    &m_cur_recorder, &m_cur_recorder_mtx, m_thread_pool.get());
+            return std::make_unique<SeqRecorderImpl>(&sm_cur_recorder,
+                                                     m_thread_pool.get(), this);
         }
 
-        //! current sequence recorder
-        SeqRecorderImpl* cur_recorder() const { return m_cur_recorder; }
+        //!
current sequence recorder of this thread +#ifndef IOS + static SeqRecorderImpl* cur_recorder() { return sm_cur_recorder; } +#else + SeqRecorderImpl* cur_recorder() { return sm_cur_recorder; } +#endif void add_callback(Task &&task) override { if (!check_global_finalized("add_callback()")) { @@ -607,6 +470,181 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase { }; MGB_DYN_TYPE_OBJ_FINAL_IMPL(CpuCompNodeImpl); CpuCompNodeImpl* CpuCompNodeImpl::sm_default_cpu_comp_node_ptr; +#ifndef IOS +thread_local CpuCompNode::SeqRecorderImpl* CpuCompNodeImpl::sm_cur_recorder = + nullptr; +#endif + +void CpuCompNode::SeqRecorderImpl::check_the_same_comp_node( + const CompNode& comp_node) const { + if (mgb_unlikely(comp_node.valid())) { + mgb_assert(m_record_compnode == comp_node, + "CompNode %s can't hook in CompNode %s when recording\n", + comp_node.locator().to_string().c_str(), + m_record_compnode.locator().to_string().c_str()); + } +} + +//! implementation of CPUDispatcher that is passed to megdnn via megcore +class CpuCompNode::WorkerQueue::DispatcherImpl final: public CPUDispatcher { + std::atomic_size_t m_nr_task{0}; + std::shared_ptr m_queue; + CpuCompNode::CompNodeImpl* const m_comp_node; + +public: + DispatcherImpl(const std::shared_ptr& queue, + CpuCompNode::CompNodeImpl* comp_node) + : m_queue{queue}, m_comp_node{comp_node} {} + + void dispatch(Task&& task) override { + if (auto recorder = m_comp_node->cur_recorder()) { + recorder->dispatch(std::move(task), m_comp_node); + } else { + m_nr_task.fetch_add(1, std::memory_order_relaxed); + auto kern = [task](size_t, size_t) { task(); }; + m_queue->add_task({kern, static_cast(1_z)}); + } + } + + void dispatch(MultiThreadingTask&& task, size_t parallelism) override { + if (auto recorder = m_comp_node->cur_recorder()) { + recorder->dispatch({std::move(task), parallelism}, m_comp_node); + } else { + m_nr_task.fetch_add(1, std::memory_order_relaxed); + m_queue->add_task({std::move(task), parallelism}); + } + } + + void sync() override { + if (auto recorder = m_comp_node->cur_recorder()) { + recorder->on_sync(m_comp_node); + } else { + m_queue->wait_all_task_finish(); + } + } + + size_t nr_threads() override { + if (auto recorder = m_comp_node->cur_recorder()) { + return recorder->nr_threads(m_comp_node); + } else { + return m_queue->nr_threads(); + } + } + + size_t get_nr_dispatched_tasks() const override { return m_nr_task; } + + void set_affinity(AffinityCallBack&& affinity_cb) override { + auto thread_pool = m_queue->get_thread_pool(); + if (thread_pool) { + thread_pool->set_affinity(affinity_cb); + } else { + auto affinity_run = [affinity_cb](size_t, size_t) { + affinity_cb(0); + }; + m_queue->add_task({affinity_run, 1_z}); + } + } +}; + +//! 
implementation of InplaceCPUDispatcher
+class InplaceCPUDispatcher final : public CPUDispatcher {
+    std::atomic_size_t m_nr_task{0};
+    ThreadPool* m_thread_pool = nullptr;
+    CpuCompNode::CompNodeImpl* const m_comp_node;
+
+public:
+    InplaceCPUDispatcher(CpuCompNode::CompNodeImpl* comp_node,
+                         ThreadPool* thread_pool = nullptr)
+            : m_thread_pool(thread_pool), m_comp_node(comp_node) {}
+
+    void dispatch(Task&& task) override {
+        if (auto recorder = m_comp_node->cur_recorder()) {
+            recorder->dispatch(std::move(task), m_comp_node);
+        } else if (m_thread_pool) {
+            m_nr_task.fetch_add(1, std::memory_order_relaxed);
+            auto kern = [task](size_t, size_t) { task(); };
+            m_thread_pool->add_task({kern, static_cast<size_t>(1_z)});
+        } else {
+            m_nr_task.fetch_add(1, std::memory_order_relaxed);
+            task();
+        }
+    }
+
+    void dispatch(MultiThreadingTask&& task, size_t parallelism) override {
+        if (auto recorder = m_comp_node->cur_recorder()) {
+            recorder->dispatch({std::move(task), parallelism}, m_comp_node);
+        } else if (m_thread_pool) {
+            m_nr_task.fetch_add(1, std::memory_order_relaxed);
+            m_thread_pool->add_task({task, parallelism});
+        } else {
+            m_nr_task.fetch_add(1, std::memory_order_relaxed);
+            for (size_t i = 0; i < parallelism; i++) {
+                task(i, 0);
+            }
+        }
+    }
+
+    size_t nr_threads() override {
+        return m_thread_pool ? m_thread_pool->nr_threads() : 1_z;
+    }
+
+    void sync() override {
+        if (auto recorder = m_comp_node->cur_recorder()) {
+            recorder->on_sync(m_comp_node);
+        } else if (m_thread_pool) {
+            m_thread_pool->deactive();
+        }
+    }
+
+    size_t get_nr_dispatched_tasks() const override { return m_nr_task; }
+
+    void set_affinity(AffinityCallBack&& affinity_cb) override {
+        if (auto recorder = m_comp_node->cur_recorder()) {
+            recorder->get_thread_pool()->set_affinity(affinity_cb);
+        } else if (m_thread_pool) {
+            m_thread_pool->set_affinity(affinity_cb);
+        } else {
+            affinity_cb(0);
+        }
+    }
+};
+
+CpuCompNode::CompNodeImpl::CompNodeImpl(
+        const Locator& locator, const Locator& locator_logical,
+        const std::shared_ptr<WorkerQueue>& worker_queue)
+        : CpuDispatchableBase(static_free_device, static_free_host),
+          m_worker_queue{worker_queue},
+          m_locator(locator),
+          m_locator_logical(locator_logical) {
+    auto cn = make_comp_node_from_impl(this);
+    if (locator.type == DeviceType::MULTITHREAD) {
+        m_thread_pool = std::unique_ptr<ThreadPool>(
+                new ThreadPool(static_cast<size_t>(locator.nr_threads)));
+        mgb_assert(m_thread_pool, "ThreadPool create failed");
+    }
+
+    if (locator.type == DeviceType::CPU) {
+        if (locator.device == Locator::DEVICE_CPU_DEFAULT) {
+            sm_default_cpu_comp_node_ptr = this;
+            m_env.init_cpu({std::make_shared<InplaceCPUDispatcher>(this)}, cn);
+        } else {
+            m_env.init_cpu({std::make_shared<WorkerQueue::DispatcherImpl>(
+                                   m_worker_queue, this)},
+                           cn);
+        }
+    } else if (locator.type == DeviceType::MULTITHREAD) {
+        if (locator.device == Locator::DEVICE_MULTITHREAD_DEFAULT) {
+            m_env.init_cpu({std::make_shared<InplaceCPUDispatcher>(
+                                   this, m_thread_pool.get())},
+                           cn);
+        } else {
+            m_worker_queue->attach_thread_pool(m_thread_pool.get());
+            m_env.init_cpu({std::make_shared<WorkerQueue::DispatcherImpl>(
+                                   m_worker_queue, this)},
+                           cn);
+        }
+    }
+}
 
 class CpuCompNodeImpl::CompSeqRecEventImpl final
         : public CpuDispatchableBase::EventImpl {
@@ -617,7 +655,7 @@ class CpuCompNodeImpl::CompSeqRecEventImpl final
                 incr_nr_req();
                 on_finish();
             };
-            rec->dispatch_allow_after_sync(callback);
+            rec->dispatch_allow_after_sync(callback, m_comp_node_impl);
         } else {
             EventImpl::do_record();
         }
@@ -633,14 +671,50 @@ public:
     using EventImpl::EventImpl;
 };
 
+class CpuCompNodeImpl::CpuEventImpl final
+        : public CpuDispatchableBase::EventImpl {
+#if MGB_HAVE_THREAD
+    void host_wait_cv() override {
+        for (size_t i = 0, it = SCQueueSynchronizer::max_spin() / 20; i < it;
+             ++i) {
+            if
(finished()) { + auto thread_pool = + static_cast(m_comp_node_impl) + ->get_thread_pool(); + if (thread_pool) { + thread_pool->deactive(); + } + return; + } + } + m_dev_wait_nr_waiter.fetch_add(1, std::memory_order_release); + for (;;) { + std::unique_lock lock{m_dev_wait_mtx}; + if (finished()) { + break; + } + m_dev_wait_cv.wait(lock); + } + m_dev_wait_nr_waiter.fetch_sub(1, std::memory_order_release); + auto thread_pool = static_cast(m_comp_node_impl) + ->get_thread_pool(); + if (thread_pool) { + thread_pool->deactive(); + } + } +#endif +public: + using EventImpl::EventImpl; +}; + std::unique_ptr CpuCompNodeImpl::create_event(size_t flags) { if (m_worker_queue) { m_worker_queue->check_exception(); } - if (m_cur_recorder) { + if (sm_cur_recorder) { return std::make_unique(this, flags); } else { - return std::make_unique(this, flags); + return std::make_unique(this, flags); } } @@ -921,11 +995,6 @@ bool CpuCompNode::CpuDispatchableBase::EventImpl::do_finished() { void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() { for (size_t i = 0, it = SCQueueSynchronizer::max_spin() / 20; i < it; ++i) { if (finished()) { - auto thread_pool = static_cast(m_comp_node_impl) - ->get_thread_pool(); - if (thread_pool) { - thread_pool->deactive(); - } return; } } @@ -939,11 +1008,6 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() { m_dev_wait_cv.wait(lock); } m_dev_wait_nr_waiter.fetch_sub(1, std::memory_order_release); - auto thread_pool = - static_cast(m_comp_node_impl)->get_thread_pool(); - if (thread_pool) { - thread_pool->deactive(); - } } CpuCompNode::CpuDispatchableBase::EventImpl::~EventImpl() noexcept { diff --git a/src/core/impl/comp_node/cpu/comp_node.h b/src/core/impl/comp_node/cpu/comp_node.h index dfb6c1b1..d8a79ecd 100644 --- a/src/core/impl/comp_node/cpu/comp_node.h +++ b/src/core/impl/comp_node/cpu/comp_node.h @@ -64,9 +64,8 @@ namespace mgb { //! implement Event on CpuDispatchableBase comp nodes class CpuCompNode::CpuDispatchableBase::EventImpl: public EventImplHelper { - + protected: TimeSpec m_prev_finish_time; - #if MGB_HAVE_THREAD std::atomic_size_t m_record_nr_req{0}, m_record_nr_finish{0}, @@ -83,22 +82,21 @@ namespace mgb { void host_wait_cv() override; - protected: - void do_record() override; + void do_record() override; - //! incr m_record_nr_req; this is used in do_record() - void incr_nr_req() { + //! incr m_record_nr_req; this is used in do_record() + void incr_nr_req() { #if MGB_HAVE_THREAD - m_record_nr_req.fetch_add(1, std::memory_order_relaxed); + m_record_nr_req.fetch_add(1, std::memory_order_relaxed); #endif - } + } - //! callback to be dispatched to comp node - void on_finish(); + //! 
callback to be dispatched to comp node
+                void on_finish();
 
-            public:
-                using EventImplHelper::EventImplHelper;
-                ~EventImpl() noexcept;
+            public:
+                using EventImplHelper::EventImplHelper;
+                ~EventImpl() noexcept;
         };
 }
diff --git a/src/core/impl/graph/cg_impl.h b/src/core/impl/graph/cg_impl.h
index 4f7457dd..8c24785f 100644
--- a/src/core/impl/graph/cg_impl.h
+++ b/src/core/impl/graph/cg_impl.h
@@ -125,9 +125,7 @@ public:
     template <typename T>
     static ComputingGraphImpl* downcast(T* ptr) = delete;
     inline static ComputingGraphImpl* downcast(ComputingGraph* graph) {
-        #ifdef MGB_ENABLE_IMPERATIVE_RUNTIME
         mgb_assert(!graph->options().imperative_proxy_graph);
-        #endif
         return static_cast<ComputingGraphImpl*>(graph);
     }
diff --git a/src/core/impl/graph/cg_impl_seq.cpp b/src/core/impl/graph/cg_impl_seq.cpp
index 8d2cb9b3..326eb4c3 100644
--- a/src/core/impl/graph/cg_impl_seq.cpp
+++ b/src/core/impl/graph/cg_impl_seq.cpp
@@ -78,14 +78,16 @@ class ComputingGraphImpl::ComputingSequence::ExecContext {
 
     void warmup_for_fake_exec_with_recorder() {
         // Rerun recorder to ensure that all internal caches stabilize
-        m_recorder->enter_fake_exec();
+        auto comp_node = *(m_comp_seq->m_used_comp_node.begin());
+        m_recorder->enter_fake_exec(comp_node);
         m_comp_seq->m_exec_env.start_exec();
         m_comp_seq->m_exec_env.wait_all();
-        m_recorder->exit_fake_exec();
+        m_recorder->exit_fake_exec(comp_node);
     }
 
     void stop_and_move_recorder() {
-        m_recorder->stop();
+        auto comp_node = *(m_comp_seq->m_used_comp_node.begin());
+        m_recorder->stop(comp_node);
         if (m_fake_next_exec) {
             m_owner_graph->options().fake_next_exec = false;
         } else {
@@ -439,17 +441,22 @@ void ComputingGraphImpl::ComputingSequence::on_first_exec() {
             m_used_comp_node.insert(j->comp_node());
     }
 
+    // we maintain a recorder because events may depend on whether recorder
+    // is enabled
+    auto recorder = check_enable_comp_node_seq_recorder();
     auto&& options = m_owner_graph->options();
-    m_exec_env.set_async_level(options.async_exec_level);
+    //! the recorder in comp_node is thread_local, so the creating thread must
+    //! be the same as the executing thread; therefore force synchronous mode
+    if (m_enable_comp_node_seq_recorder) {
+        m_exec_env.set_async_level(0);
+    } else {
+        m_exec_env.set_async_level(options.async_exec_level);
+    }
     if (options.async_exec_level) {
         for (auto i : m_used_comp_node)
             m_exec_env.add_comp_node(i);
     }
 
-    // we maintain a recorder because events may depend on whether recorder
-    // is enabled
-    auto recorder = check_enable_comp_node_seq_recorder();
-
     // create events for timing and sync
     for (auto&& i : m_used_comp_node) {
         size_t flag = 0;
diff --git a/src/core/impl/graph/operator_node.cpp b/src/core/impl/graph/operator_node.cpp
index 85500162..c59016a0 100644
--- a/src/core/impl/graph/operator_node.cpp
+++ b/src/core/impl/graph/operator_node.cpp
@@ -45,6 +45,10 @@ class PostExecActions {
         }
     };
     CompNode m_comp_node;
+    // VarNodes in m_items should be listed in the same order as in the
+    // output of the owner_opr, because the opr generates its
+    // input_waiting_spec() according to this order;
+    // see `SeqCompNodeOptimizerImpl::init_ready_event()` for more details
     SmallVector m_items;
     MGB_IF_COND_EXEC(ExecutionMask* m_mask = nullptr);
diff --git a/src/core/impl/graph/seq_comp_node_opt_impl.cpp b/src/core/impl/graph/seq_comp_node_opt_impl.cpp
index 5f166490..e135ab13 100644
--- a/src/core/impl/graph/seq_comp_node_opt_impl.cpp
+++ b/src/core/impl/graph/seq_comp_node_opt_impl.cpp
@@ -109,11 +109,8 @@ void SeqCompNodeOptimizerImpl::change_to_specific_stream(
             type = any_strong_changed ?
StreamPropType::STRONG : StreamPropType::WEAK;
             int copy_stream = CompNode::Stream::COPY;
-            int nccl_stream = CompNode::Stream::NCCL;
             if (inp_streams.count(copy_stream))
                 stream = copy_stream;
-            else if (inp_streams.count(nccl_stream))
-                stream = nccl_stream;
             mgb_assert(type != StreamPropType::NONE && stream != 0);
         }
         return prop_type_storage.second = StreamPropType{stream, type};
@@ -188,8 +185,7 @@ void SeqCompNodeOptimizerImpl::register_stream_var(
     mgb_assert(var->owner_graph() == m_owner_graph &&
                (prop_type == StreamPropType::WEAK ||
                 prop_type == StreamPropType::STRONG));
-    mgb_assert(stream == CompNode::Stream::COPY || stream ==
-            CompNode::Stream::NCCL);
+    mgb_assert(stream == CompNode::Stream::COPY);
 
     auto ins = m_var2prop_type.insert({var, {stream, prop_type}});
     if (!ins.second) {
@@ -216,16 +212,19 @@ void SeqCompNodeOptimizerImpl::init_ready_event(
     }
     m_cnpair2opr_step.clear();
 
+    // opr step, idx of output
+    using VarStep = std::pair<size_t, size_t>;
+
     // cn0 -> (cn1 -> step): step on cn1 is known to have finished for current
     // opr on cn0
-    CompNode::UnorderedMap<CompNode::UnorderedMap<size_t>> cnpair2step;
+    CompNode::UnorderedMap<CompNode::UnorderedMap<VarStep>> cnpair2step;
 
     // vars to be waited on for current opr; only the latest var needs to be
     // waited for each comp node
    CompNode::UnorderedMap<VarNode*> vars_to_wait;
    CompNode::UnorderedSet cur_used_cn;
 
-    ThinHashMap<OperatorNodeBase*, size_t> opr2step;
+    ThinHashMap<VarNode*, VarStep> var2step;
     size_t cur_step = 0;
 
     using OprNodeProp = OperatorNodeBase::NodeProp;
@@ -266,7 +265,7 @@ void SeqCompNodeOptimizerImpl::init_ready_event(
             }
             if ((OprNodeProp::is_device_comp_order_dep(i.second) &&
                  i.first->comp_node() != cn) || pdv_need_sync_host) {
-                auto step = opr2step.at(i.first->owner_opr());
+                auto step = var2step.at(i.first);
                 auto ins = dep2step.insert({i.first->comp_node(), step});
                 // only wait for var if it is beyond currently known
                 // synchronized step
@@ -290,16 +289,25 @@ void SeqCompNodeOptimizerImpl::init_ready_event(
 
             auto&& record = m_cnpair2opr_step[cn];
             for (auto&& i : vars_to_wait) {
-                auto step_done = opr2step.at(i.second->owner_opr());
+                auto step_done = var2step.at(i.second).first;
                 auto&& seq = record[i.first];
-                mgb_assert(seq.empty() || step_done > seq.back().second);
-                seq.emplace_back(cur_step, step_done);
+                // for a multi-output operator, multiple other operators may
+                // depend on different output varnodes, and those output vars
+                // share the same opr step number
+                mgb_assert(seq.empty() || step_done >= seq.back().second);
+                if (seq.empty() || step_done > seq.back().second) {
+                    seq.emplace_back(cur_step, step_done);
+                }
             }
         }
     }
 
         opr->input_waiting_spec(std::move(waiting_spec));
-        opr2step[opr] = cur_step ++;
+        auto&& usable_output = opr->usable_output();
+        for (size_t i = 0; i < usable_output.size(); ++ i) {
+            var2step[usable_output[i]] = {cur_step, i};
+        }
+        cur_step ++;
     }
     mgb_assert(cur_step == seq.size());
 }
diff --git a/src/core/impl/tensor.cpp b/src/core/impl/tensor.cpp
index 3ae8ff14..1ef2ab7d 100644
--- a/src/core/impl/tensor.cpp
+++ b/src/core/impl/tensor.cpp
@@ -614,8 +614,12 @@ void mgb::dev_tensor_memset(const DeviceTensorND& tensor, int val) {
 #endif
 #if MGB_ATLAS
         case CompNode::DeviceType::ATLAS:
+#if MGB_USE_ATLAS_ASYNC_API
             MGB_ATLAS_CHECK(aclrtMemsetAsync(ptr, -1, val, size,
                                              env.atlas_env().stream));
+#else
+            MGB_ATLAS_CHECK(aclrtMemset(ptr, -1, val, size));
+#endif
             break;
 #endif
         case CompNode::DeviceType::CPU: {
diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h
index 9cf376a9..e2920f07 100644
--- a/src/core/include/megbrain/comp_node.h
+++
b/src/core/include/megbrain/comp_node.h
@@ -32,39 +32,7 @@
 namespace cg {
 class ComputingGraph;
 }
 
-/*!
- * \brief record computation operations on a computing node
- *
- * This is used for fast execution of an identical computation sequence where
- * only input/output data differ.
- *
- * When this object is created from a comp node, recording starts immediately.
- * Call stop() when computation finishes, and call replay() when it needs to be
- * re-executed.
- *
- * Implementations should hold a global lock on the comp node until stop() is
- * called.
- */
-class CompNodeSeqRecorder {
-    public:
-        virtual ~CompNodeSeqRecorder() noexcept = default;
-
-        /*!
-         * \brief Enter fake-exec mode
-         *
-         * Memory allocation/free is only allowed in fake-exec mode, and kernels
-         * should not be actually recorded in this mode.
-         *
-         * This should be paired with exit_fake_exec()
-         */
-        virtual void enter_fake_exec() = 0;
-
-        //! Exit fake-exec mode
-        virtual void exit_fake_exec() = 0;
-
-        virtual void stop() = 0;
-        virtual void replay() = 0;
-};
+class CompNodeSeqRecorder;
 
 /*!
  * \brief identifier for a memory node
@@ -207,8 +175,7 @@ class CompNode {
         static constexpr int
             COPY = -1,
            REMOTE_SEND = -2,
-            LOOP_SWAP = -3,
-            NCCL = -4;
+            LOOP_SWAP = -3;
     };
 
     CompNode() = default;
@@ -564,19 +531,57 @@ class CompNode {
         //! is needed
         ImplBase *m_impl = nullptr;
 
-        CompNode(ImplBase *impl):
-            m_impl{impl}
-        {}
-
         friend class CompNodeEnv;
         friend struct HashTrait;
         friend class CompNodeImplHelper;
+    public:
+        CompNode(ImplBase* impl) : m_impl{impl} {}
 };
 
 MGB_DEF_ENUM_CLASS_BIT_OPR(CompNode::Flag)
 
 /*!
+ * \brief record computation operations on a computing node
+ *
+ * This is used for fast execution of an identical computation sequence where
+ * only input/output data differ.
+ *
+ * When this object is created from a comp node, recording starts immediately.
+ * Call stop() when computation finishes, and call replay() when it needs to be
+ * re-executed.
+ *
+ * Implementations should be thread safe within a comp node, so that multiple
+ * threads can record on the same comp node simultaneously; this is achieved
+ * by keeping the recorder thread local in the comp node.
+ *
+ * Note: once recording is over, the recorder is independent of the comp node,
+ * so tasks dispatched into the recorder must not rely on comp node methods,
+ * and replay() runs on the user thread.
+ */
+class CompNodeSeqRecorder {
+public:
+    virtual ~CompNodeSeqRecorder() noexcept = default;
+
+    /*!
+     * \brief Enter fake-exec mode
+     *
+     * Memory allocation/free is only allowed in fake-exec mode, and kernels
+     * should not be actually recorded in this mode.
+     *
+     * This should be paired with exit_fake_exec()
+     */
+    virtual void enter_fake_exec(const CompNode& comp_node) = 0;
+
+    //! Exit fake-exec mode
+    virtual void exit_fake_exec(const CompNode& comp_node) = 0;
+
+    virtual void stop(const CompNode& comp_node) = 0;
+
+    virtual void replay() = 0;
+};
+
+/*!
* \brief event associated with a CompNode node, used for cross-device * synchronization */ diff --git a/src/core/include/megbrain/graph/bases.h b/src/core/include/megbrain/graph/bases.h index 95dd7eb6..7fc505a3 100644 --- a/src/core/include/megbrain/graph/bases.h +++ b/src/core/include/megbrain/graph/bases.h @@ -22,10 +22,12 @@ #define MGB_ENABLE_SUBLINEAR ((!MGB_BUILD_SLIM_SERVING) && (!!MGB_HAVE_THREAD)) #endif // MGB_ENABLE_SUBLINEAR +// FIXME: re-enable after the memory swap pass is rewritten or its existing tests pass +#define MGB_ENABLE_MEMORY_SWAP 0 #ifndef MGB_ENABLE_MEMORY_SWAP #define MGB_ENABLE_MEMORY_SWAP \ ((!MGB_BUILD_SLIM_SERVING) && (!!MGB_HAVE_THREAD) && (MGB_CUDA)) -#endif +#endif // MGB_ENABLE_MEMORY_SWAP #ifndef MGB_ENABLE_PARTIAL_EXECUTION #define MGB_ENABLE_PARTIAL_EXECUTION (!MGB_BUILD_SLIM_SERVING) diff --git a/src/core/include/megbrain/graph/cg.h b/src/core/include/megbrain/graph/cg.h index 9b83e459..27b33565 100644 --- a/src/core/include/megbrain/graph/cg.h +++ b/src/core/include/megbrain/graph/cg.h @@ -97,6 +97,9 @@ struct GraphCommonOptimizeOptions { bool fuse_conv_bias_with_z = false; //! whether to enable fast-run profiled winograd opr replace bool weight_winograd_transform = false; + //! whether to enable weight preprocessing; if enabled it may use more + //! memory; disabled by default for now + bool weight_preprocess = false; enum LayoutTransform : uint32_t { DEFAULT, NCHW4, ///< compute using NCHW4 tensor format @@ -127,6 +130,7 @@ struct GraphCommonOptimizeOptions { SET(fuse_conv_bias_nonlinearity); SET(fuse_conv_bias_with_z); SET(weight_winograd_transform); + SET(weight_preprocess); #undef SET #define SET(_trans, _trans_capital) \ GraphCommonOptimizeOptions& enable_##_trans() { \ diff --git a/src/core/test/comp_node_helper.cpp b/src/core/test/comp_node_helper.cpp index 28c6620d..73df2d02 100644 --- a/src/core/test/comp_node_helper.cpp +++ b/src/core/test/comp_node_helper.cpp @@ -471,6 +471,37 @@ void run(CompNode cn) { MGB_ASSERT_TENSOR_EQ(y_expect, host_y); }
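A note on the weight_preprocess option added to GraphCommonOptimizeOptions above: it lets oprs such as convolution cache a preprocessed copy of their weights, trading memory for speed; the convolution.cpp hunk later in this patch makes mixin_allow_weight_preprocess() return false unless the flag is set. A minimal sketch of opting in (setting the field directly; whether the SET macro also generates a weight_preprocess() setter is an assumption):

auto graph = ComputingGraph::make();
graph->options().graph_opt.weight_preprocess = true;  // may use more memory
// build and compile as usual; convolutions whose filters are persistent
// device values may now cache their preprocessed form across executions.

+//!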
single thread multi recorder run interleave +template <> +void run(CompNode cn) { + using ConvParam = opr::Convolution::Param; + ConvParam param; + param.sparse = ConvParam::Sparse::GROUP; + HostTensorGenerator<> gen; + std::vector host_z_v(2, HostTensorND()); + std::vector> funcs; + auto host_x = gen({3, 4, 10, 8}, cn), host_y = gen({2, 3, 2, 3, 3}, cn); + auto gen_graph = + [&](int graph_id) -> std::unique_ptr { + auto graph = ComputingGraph::make(); + auto x = opr::Host2DeviceCopy::make(*graph, host_x), + y = opr::Host2DeviceCopy::make(*graph, host_y), + z = opr::Convolution::make(x, y, param); + graph->options().comp_node_seq_record_level = 1; + return graph->compile({make_callback_copy(z, host_z_v[graph_id])}); + }; + funcs.push_back(gen_graph(0)); + funcs.push_back(gen_graph(1)); + for (int iter = 0; iter < 10; ++iter) { + host_x->copy_from_fixlayout(*gen(host_x->shape(), cn)); + funcs[0]->execute(); + funcs[1]->execute(); + auto expect = eval_conv_cpu(*host_x, *host_y, param); + MGB_ASSERT_TENSOR_NEAR(expect, host_z_v[0], 1e-3) << "iter " << iter; + MGB_ASSERT_TENSOR_NEAR(expect, host_z_v[1], 1e-3) << "iter " << iter; + } +} + template <> void run(CompNode) {} diff --git a/src/core/test/comp_node_helper.h b/src/core/test/comp_node_helper.h index 441f885c..ad5d8a31 100644 --- a/src/core/test/comp_node_helper.h +++ b/src/core/test/comp_node_helper.h @@ -56,7 +56,7 @@ namespace seq_rec { cb(dyn_elemwise_fake_exec) \ cb(level2) cb(level2_multi_holder) cb(level2_share_storage) \ cb(level2_exec_check) cb(sync_from_func) cb(cb_non_contig) \ - cb(shape_dep_const_shape) + cb(shape_dep_const_shape) cb(multi_recorder_run) // clang-format on #define def_tags(name) \ diff --git a/src/core/test/graph/misc.cpp b/src/core/test/graph/misc.cpp index b7830d56..9f67edc2 100644 --- a/src/core/test/graph/misc.cpp +++ b/src/core/test/graph/misc.cpp @@ -1085,6 +1085,22 @@ TEST(TestGraph, DynShapeDepCrossCN) { ASSERT_EQ(24.f, host_b.ptr()[0]); } +namespace { +void check_wait(SymbolVar dest, SymbolVar dep) { + if (!dep.node()) { + ASSERT_EQ(0u, + dest.node()->owner_opr()->input_waiting_spec().size()); + return; + } + cg::OperatorNodeBase::InputWaitingSpecElem ws; + unpack_vector(dest.node()->owner_opr()->input_waiting_spec(), ws); + ASSERT_EQ(ws.comp_node, dest.node()->comp_node()); + VarNode *get; + unpack_vector(ws.dev_ready, get); + ASSERT_EQ(dep, get); +}; +} + TEST(TestGraph, InputWaitingSpec) { auto cns = load_multiple_xpus(2); constexpr size_t SIZE = 12345; @@ -1115,26 +1131,40 @@ TEST(TestGraph, InputWaitingSpec) { MGB_ASSERT_FLOAT_EQ(px[i] + 1, pz0[i]); MGB_ASSERT_FLOAT_EQ(px[i] + 2, pz1[i]); } - - auto check_wait = [](SymbolVar dest, SymbolVar dep) { - if (!dep.node()) { - ASSERT_EQ(0u, - dest.node()->owner_opr()->input_waiting_spec().size()); - return; - } - cg::OperatorNodeBase::InputWaitingSpecElem ws; - unpack_vector(dest.node()->owner_opr()->input_waiting_spec(), ws); - ASSERT_EQ(ws.comp_node, dest.node()->comp_node()); - VarNode *get; - unpack_vector(ws.dev_ready, get); - ASSERT_EQ(dep, get); - }; check_wait(y0, x); check_wait(y1, x + 1); check_wait(z1, y1 + 1); check_wait(z0, {}); } +TEST(TestGraph, InputWaitingSpecMultiOut) { + auto cn0 = CompNode::load("xpu0:0"), cn1 = CompNode::load("xpu0:1"); + HostTensorGenerator<> gen; + auto graph = cg::ComputingGraph::make(); + graph->options().graph_opt_level = 0; + graph->options().var_sanity_check_first_run = 0; + graph->options().async_exec_level = 0b100; + graph->options().seq_opt.enable_seq_comp_node_opt = false; + size_t nr_out = 1024, length 
= 32; + auto hv = gen({nr_out * length}, cn0); + auto x = opr::Host2DeviceCopy::make(*graph, hv); + auto outs = opr::Split::make(x, opr::Split::Options::make_average(0, nr_out)); + cg::ComputingGraph::OutputSpec output_spec; + for (size_t i = 0; i < nr_out; ++ i) { + auto y = opr::Copy::make(outs[i], cn1); + y.node()->owner_opr()->node_prop().attribute().priority = i ? nr_out - i : 0; + output_spec.push_back({y, {}}); + } + auto func = graph->compile(output_spec); + func->execute().wait(); + + check_wait(output_spec[0].first, outs[0]); + check_wait(output_spec[nr_out - 1].first, outs[nr_out - 1]); + for (size_t i = 1; i < nr_out - 1; ++ i) { + check_wait(output_spec[i].first, {}); + } +} + TEST(TestGraph, GradStaticShape) { for (bool enable: {false, true}) { auto graph = ComputingGraph::make(); diff --git a/src/core/test/graph/multi_thread.cpp b/src/core/test/graph/multi_thread.cpp index e4c484b1..1b9af8b0 100644 --- a/src/core/test/graph/multi_thread.cpp +++ b/src/core/test/graph/multi_thread.cpp @@ -12,6 +12,7 @@ #include "megbrain/opr/io.h" #include "megbrain/opr/utility.h" #include "megbrain/system.h" +#include "megbrain/opr/dnn/convolution.h" #include "megbrain/test/helper.h" @@ -20,6 +21,37 @@ using namespace mgb; +namespace{ +template +HostTensorND eval_conv(const std::shared_ptr& src, + const std::shared_ptr& filter, + const typename Opr::Param& param = {}) { + auto graph = ComputingGraph::make(); + graph->options().log_level = 0; + SymbolVar x = opr::Host2DeviceCopy::make(*graph, src); + SymbolVar y = opr::Host2DeviceCopy::make(*graph, filter); + SymbolVar z = Opr::make(x, y, param); + HostTensorND host_z; + auto func = graph->compile({make_callback_copy(z, host_z)}); + func->execute(); + + host_z.sync(); + return host_z; +} + +template +HostTensorND eval_conv_cpu(const HostTensorND& xv, const HostTensorND& fv, + const typename Opr::Param& param = {}) { + auto cn = CompNode::load("cpux"); + auto src = std::make_shared(cn, xv.layout()), + filter = std::make_shared(cn, fv.layout()); + memcpy(src->raw_ptr(), xv.raw_ptr(), xv.layout().span().dist_byte()); + memcpy(filter->raw_ptr(), fv.raw_ptr(), fv.layout().span().dist_byte()); + return eval_conv(src, filter, param); +} +} // namespace + + TEST(TestGraph, AsyncExecLevel) { REQUIRE_GPU(1); @@ -164,5 +196,37 @@ TEST(TestGraph, ParallelRun) { for (auto&& i : workers) i.join(); } +#ifndef IOS +TEST(TestGraph, MultiThreadRecorder) { + using ConvParam = opr::Convolution::Param; + ConvParam param; + param.sparse = ConvParam::Sparse::GROUP; + HostTensorGenerator<> gen; + auto cn = CompNode::load("cpux"); + auto host_x = gen({3, 4, 10, 8}, cn), host_y = gen({2, 3, 2, 3, 3}, cn); + auto worker = [&](int record_level) { + HostTensorND host_z; + auto graph = ComputingGraph::make(); + auto x = opr::Host2DeviceCopy::make(*graph, host_x), + y = opr::Host2DeviceCopy::make(*graph, host_y), + z = opr::Convolution::make(x, y, param); + graph->options().comp_node_seq_record_level = record_level; + graph->options().var_sanity_check_first_run = false; + auto func = graph->compile({make_callback_copy(z, host_z)}); + for (int i = 0; i < 5; i++) { + func->execute(); + } + auto expect = eval_conv_cpu(*host_x, *host_y, param); + MGB_ASSERT_TENSOR_NEAR(expect, host_z, 1e-3); + }; + + std::vector workers; + for (size_t i = 0; i < 4; ++i) + workers.emplace_back(worker, i % 2); + + for (auto&& i : workers) + i.join(); +} +#endif // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/gopt/impl/inference.cpp b/src/gopt/impl/inference.cpp 
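The inference.cpp changes below teach ConvertF32ToF16Pass about ConvolutionBackwardData, so deconvolutions survive the f16-io/f32-comp conversion with compute_mode forced to FLOAT32. Usage, as exercised by the Float16IOFloat32ComputeDeConv test added further down (y is an already-built deconv endpoint; helpers as in the surrounding tests):

auto options = gopt::OptimizeForInferenceOptions{};
options.enable_f16_io_f32_comp();
SymbolVar y_opt;
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
// I/O vars become float16 while the deconv computes in float32; the
// endpoint dtype remains float32.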
index 9e40def9..8115f61c 100644 --- a/src/gopt/impl/inference.cpp +++ b/src/gopt/impl/inference.cpp @@ -561,7 +561,7 @@ void ParamFusePass::apply(OptState &state) const { } SymbolVar new_var; - bool is_default_format = var->layout().format.is_default(); + bool is_default_format = var->format().is_default(); if (cg::is_static_var_value(var) && is_default_format) { // use ImmutableTensor for inferable vars HostTensorND hv; @@ -771,6 +771,29 @@ std::unique_ptr<ConvertF32ToF16Pass> ConvertF32ToF16Pass::make( return new_conv_opr.node()->owner_opr(); }; + auto replace_deconv_opr = [use_f32_comp](OperatorNodeBase* opr, + const VarNodeArray& new_inp) { + mgb_assert(opr->input().size() == new_inp.size()); + auto& deconv_opr = opr->cast_final_safe<opr::ConvolutionBackwardData>(); + auto new_param = deconv_opr.param(); + if (use_f32_comp) { + new_param.compute_mode = + megdnn::param::Convolution::ComputeMode::FLOAT32; + } + mgb_assert(new_inp[0]->dtype() == dtype::Float16(), + "inp %s:%s, owner_opr:%s", new_inp[0]->dtype().name(), + new_inp[0]->name().c_str(), + new_inp[0]->owner_opr()->name().c_str()); + mgb_assert(new_inp[1]->dtype() == dtype::Float16(), + "inp %s:%s, owner_opr:%s", new_inp[1]->dtype().name(), + new_inp[1]->name().c_str(), + new_inp[1]->owner_opr()->name().c_str()); + auto new_deconv_opr = opr::ConvolutionBackwardData::make( + new_inp[0], new_inp[1], new_param, deconv_opr.execution_policy(), + deconv_opr.config()); + return new_deconv_opr.node()->owner_opr(); + }; + auto replace_convbias_opr = [use_f32_comp](OperatorNodeBase* opr, const VarNodeArray& new_inp) { auto& convbias_opr = opr->cast_final_safe<opr::ConvBias>(); @@ -941,6 +964,7 @@ std::unique_ptr<ConvertF32ToF16Pass> ConvertF32ToF16Pass::make( replace_func[opr::Host2DeviceCopy::typeinfo()] = replace_h2d_opr; replace_func[opr::SharedDeviceTensor::typeinfo()] = replace_sdt_opr; replace_func[opr::Convolution::typeinfo()] = replace_conv_opr; + replace_func[opr::ConvolutionBackwardData::typeinfo()] = replace_deconv_opr; replace_func[opr::ConvBias::typeinfo()] = replace_convbias_opr; replace_func[opr::MatrixMul::typeinfo()] = replace_matmul_opr; replace_func[opr::Reduce::typeinfo()] = replace_reduce_opr; diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp index c6ad8bd1..debac210 100644 --- a/src/gopt/impl/tensor_reformat.cpp +++ b/src/gopt/impl/tensor_reformat.cpp @@ -1415,6 +1415,7 @@ VarNode* EnableNCHW4Pass::on_graph_endpoint_var(VarNode* new_var, return new_var; }
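Every NCHW4 replace functor below now begins with a float32 bail-out, since float oprs have no NCHW4 kernels yet; such oprs are copied unchanged via copy_opr_shallow. The shared pattern, extracted as a sketch (meant to live inside the pass source with its usual headers; not a new API):

// Guard used at the top of each replace functor: leave float oprs alone.
auto skip_float = [](OperatorNodeBase* opr,
                     const VarNodeArray& new_inp) -> OperatorNodeBase* {
    if (new_inp[0]->dtype().enumv() == DTypeEnum::Float32) {
        return serialization::copy_opr_shallow(*opr, new_inp,
                                               opr->config());
    }
    return nullptr;  // caller proceeds with the NCHW4 rewrite
};

+//! FIXME: float oprs do not support NCHW4 yet; support should be added in the future.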
std::unique_ptr EnableNCHW4Pass::make_nchw4_converter() { MIDOUT_B("EnableNCHW4Pass::make") auto ret = std::make_unique(); @@ -1467,6 +1468,10 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter() { auto replace_conv_opr = [trans_nchw4, conv_format]( OperatorNodeBase* opr, const VarNodeArray& new_inp) { + if (new_inp[0]->dtype().enumv() == DTypeEnum::Float32) { + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } mgb_assert(opr->input().size() == new_inp.size()); auto& conv_opr = opr->cast_final_safe(); if (conv_opr.param().format != @@ -1503,6 +1508,10 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter() { src_to_nchw4_mode]( OperatorNodeBase* opr, const VarNodeArray& new_inp) { + if (new_inp[0]->dtype().enumv() == DTypeEnum::Float32) { + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } mgb_assert(opr->input().size() == new_inp.size()); auto& batch_conv_bias_opr = opr->cast_final_safe(); @@ -1580,6 +1589,10 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter() { src_to_nchw4_mode]( OperatorNodeBase* opr, const VarNodeArray& new_inp) { + if (new_inp[0]->dtype().enumv() == DTypeEnum::Float32) { + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } mgb_assert(opr->input().size() == new_inp.size()); auto& conv_bias_opr = opr->cast_final_safe(); if (conv_bias_opr.param().format != @@ -1647,6 +1660,10 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter() { }; auto replace_elemwise_opr = [=](OperatorNodeBase* opr, const VarNodeArray& new_inp) { + if (new_inp[0]->dtype().enumv() == DTypeEnum::Float32) { + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } mgb_assert(opr->input().size() == new_inp.size()); bool has_inp_changed = false; for (size_t i = 0; i < opr->input().size(); i++) { @@ -1691,6 +1708,10 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter() { }; auto replace_pooling_opr = [](OperatorNodeBase* opr, const VarNodeArray& new_inp) { + if (new_inp[0]->dtype().enumv() == DTypeEnum::Float32) { + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } using Param = opr::PoolingForward::Param; using Format = Param::Format; mgb_assert(opr->input().size() == new_inp.size()); @@ -1716,6 +1737,10 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter() { }; auto replace_resize_opr = [](OperatorNodeBase* opr, const VarNodeArray& new_inp) { + if (new_inp[0]->dtype().enumv() == DTypeEnum::Float32) { + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } using Param = opr::ResizeForward::Param; using Format = Param::Format; mgb_assert(opr->input().size() == new_inp.size()); @@ -1738,6 +1763,10 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter() { }; auto replace_warp_perspective_opr = [](OperatorNodeBase* opr, const VarNodeArray& new_inp) { + if (new_inp[0]->dtype().enumv() == DTypeEnum::Float32) { + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } using Param = opr::WarpPerspective::Param; using Format = Param::Format; mgb_assert(opr->input().size() == new_inp.size()); @@ -1833,7 +1862,8 @@ static inline bool nchw_nchwxx_valid( auto& src_node = new_inp[0]; auto& filter_node = new_inp[1]; auto dst_node = opr.output(0); - if (filter_node->shape().ndim != 4) { + //! 
already transformed or have fuse Z + if (filter_node->shape().ndim != 4 || new_inp.size() == 4) { return false; } megdnn::ConvolutionBase::CanonizedFilterMeta fm; @@ -1855,7 +1885,8 @@ static inline bool nchw_nchwxx_valid( megdnn::ConvBiasForward::BiasMode bias_mode = megdnn::ConvBiasForward::BiasMode::NO_BIAS; - if (std::is_same::value) { + if (std::is_same::value && + new_inp.size() > 2) { TensorShape bias_shape = new_inp[2]->shape(); if (bias_shape.ndim == 5) { bias_shape = nchwxx_shape_2_nchw_shape(bias_shape); @@ -2038,6 +2069,8 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { pack_c_size](OperatorNodeBase* opr, const VarNodeArray& new_inp) { mgb_assert(opr->input().size() == new_inp.size()); + mgb_assert(opr->input().size() <= 3, + "nchwxx does not support conv_bias fuse Z right now"); auto& conv_bias_opr = opr->cast_final_safe(); mgb_assert(conv_bias_opr.param().format == megdnn::param::ConvBias::Format::NCHW, @@ -2063,7 +2096,7 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { temp_inp[0] = new_src.node(); } //! the bias is nchwxx - if (temp_inp[2]->shape().ndim == 5) { + if (new_inp.size() > 2 && temp_inp[2]->shape().ndim == 5) { auto new_bias = RelayoutPlaceholder::make(new_inp[2], src_to_nchw_mode); temp_inp[2] = new_bias.node(); @@ -2073,7 +2106,7 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { return new_opr; } else if (is_trans.first == TransType::TRANS_PURE_NCHWXX) { VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1], - *conv_bias_bias = new_inp[2]; + *conv_bias_bias = nullptr; //! filter trans to nchwxx mode mgb_assert(new_inp[1]->shape().ndim == 4 || new_inp[1]->shape().ndim == 5, @@ -2088,21 +2121,34 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { src_to_nchwxx_mode); conv_bias_src = new_src.node(); } - //! bias trans to nchwxx mode, bias may be scale - if (new_inp[2]->shape().ndim == 4) { - auto new_bias = RelayoutPlaceholder::make(new_inp[2], - src_to_nchwxx_mode); - conv_bias_bias = new_bias.node(); + //! 
bias trans to nchwxx mode + if (new_inp.size() > 2) { + if (new_inp[2]->shape().ndim == 4) { + auto new_bias = RelayoutPlaceholder::make( + new_inp[2], src_to_nchwxx_mode); + conv_bias_bias = new_bias.node(); + } else { + mgb_assert(new_inp[2]->shape().ndim == 5); + conv_bias_bias = new_inp[2]; + } } - auto new_param = conv_bias_opr.param(); new_param.format = conv_bias_format; mgb_assert(conv_bias_src->shape().ndim == 5 && conv_bias_filter->shape().ndim >= 6, "The conv_bias src dim is not trans to nchwxx"); - auto new_conv_bias_opr = opr::ConvBias::make( - conv_bias_src, conv_bias_filter, conv_bias_bias, new_param, - conv_bias_opr.execution_policy(), conv_bias_opr.config()); + SymbolVar new_conv_bias_opr; + if (conv_bias_bias) { + new_conv_bias_opr = opr::ConvBias::make( + conv_bias_src, conv_bias_filter, conv_bias_bias, + new_param, conv_bias_opr.execution_policy(), + conv_bias_opr.config()); + } else { + new_conv_bias_opr = opr::ConvBias::make( + conv_bias_src, conv_bias_filter, new_param, + conv_bias_opr.execution_policy(), + conv_bias_opr.config()); + } OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr(); mgb_assert(new_conv_bias_opr.shape().ndim == 5, "The conv_bias dst dim is not trans to nchwxx"); @@ -2110,25 +2156,37 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { } else { mgb_assert(is_trans.first == TransType::TRANS_HYBIRD_NCHWXX); VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1], - *conv_bias_bias = new_inp[2]; + *conv_bias_bias = nullptr; auto new_filter = RelayoutPlaceholder::make(new_inp[1], is_trans.second); conv_bias_filter = new_filter.node(); //! bias trans to nchwxx mode, bias may be scale - if (new_inp[2]->shape().ndim == 4) { - auto new_bias = RelayoutPlaceholder::make(new_inp[2], - src_to_nchwxx_mode); - conv_bias_bias = new_bias.node(); + if (new_inp.size() > 2) { + if (new_inp[2]->shape().ndim == 4) { + auto new_bias = RelayoutPlaceholder::make( + new_inp[2], src_to_nchwxx_mode); + conv_bias_bias = new_bias.node(); + } else { + mgb_assert(new_inp[2]->shape().ndim == 5); + conv_bias_bias = new_inp[2]; + } } mgb_assert(conv_bias_src->shape().ndim == 4 && conv_bias_filter->shape().ndim == 5); - mgb_assert((conv_bias_bias->shape().ndim == 5) || - conv_bias_bias->shape().is_scalar()); auto new_param = conv_bias_opr.param(); new_param.format = conv_bias_format; - auto new_conv_bias_opr = opr::ConvBias::make( - conv_bias_src, conv_bias_filter, conv_bias_bias, new_param, - conv_bias_opr.execution_policy(), conv_bias_opr.config()); + SymbolVar new_conv_bias_opr; + if (conv_bias_bias) { + new_conv_bias_opr = opr::ConvBias::make( + conv_bias_src, conv_bias_filter, conv_bias_bias, + new_param, conv_bias_opr.execution_policy(), + conv_bias_opr.config()); + } else { + new_conv_bias_opr = opr::ConvBias::make( + conv_bias_src, conv_bias_filter, new_param, + conv_bias_opr.execution_policy(), + conv_bias_opr.config()); + } OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr(); mgb_assert(new_conv_bias_opr.shape().ndim == 5, "The conv dst dim is not trans to nchwxx"); @@ -2246,6 +2304,10 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { relayout_inp_to_nchw; replace_func[opr::WarpAffineForward::typeinfo()] = relayout_inp_to_nchw; replace_func[opr::Reshape::typeinfo()] = relayout_inp_to_nchw; + replace_func[opr::AxisAddRemove::typeinfo()] = relayout_inp_to_nchw; + replace_func[opr::Argmax::typeinfo()] = relayout_inp_to_nchw; + replace_func[opr::Broadcast::typeinfo()] = relayout_inp_to_nchw; + 
replace_func[opr::ImmutableTensor::typeinfo()] = relayout_inp_to_nchw; } std::unique_ptr EnableNchwxxPass::make_nchwxx_converter( @@ -2430,6 +2492,8 @@ EnableNchw44DotPass::make_nchw44_dot_converter() { OperatorNodeBase* opr, const VarNodeArray& new_inp) { mgb_assert(opr->input().size() == new_inp.size()); + mgb_assert(opr->input().size() <= 3, + "nchwxx-dot does not support conv_bias fuse Z right now"); auto& conv_bias_opr = opr->cast_final_safe(); mgb_assert(conv_bias_opr.param().format == megdnn::param::ConvBias::Format::NCHW, @@ -2460,7 +2524,7 @@ EnableNchw44DotPass::make_nchw44_dot_converter() { } //! the bias is nchwxx - if (temp_inp[2]->shape().ndim == 5) { + if (new_inp.size() > 2 && temp_inp[2]->shape().ndim == 5) { auto new_bias = RelayoutPlaceholder::make( new_inp[2], RelayoutMode::NCHW4_TO_NCHW); temp_inp[2] = new_bias.node(); @@ -2470,7 +2534,7 @@ EnableNchw44DotPass::make_nchw44_dot_converter() { return new_opr; } else if (is_trans.trans_type == TransType::TRANS_PURE_NCHWXX) { VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1], - *conv_bias_bias = new_inp[2]; + *conv_bias_bias = nullptr; //! filter trans to nchwxx mode mgb_assert(new_inp[1]->shape().ndim == 4 || new_inp[1]->shape().ndim == 5, @@ -2485,21 +2549,34 @@ EnableNchw44DotPass::make_nchw44_dot_converter() { new_inp[0], RelayoutMode::NCHW_TO_NCHW4); conv_bias_src = new_src.node(); } - //! bias trans to nchwxx mode, bias may be scale - if (new_inp[2]->shape().ndim == 4) { - auto new_bias = RelayoutPlaceholder::make( - new_inp[2], RelayoutMode::NCHW_TO_NCHW4); - conv_bias_bias = new_bias.node(); + //! bias trans to nchwxx mode + if (new_inp.size() > 2) { + if (new_inp[2]->shape().ndim == 4) { + auto new_bias = RelayoutPlaceholder::make( + new_inp[2], RelayoutMode::NCHW_TO_NCHW4); + conv_bias_bias = new_bias.node(); + } else { + mgb_assert(new_inp[2]->shape().ndim == 5); + conv_bias_bias = new_inp[2]; + } } - auto new_param = conv_bias_opr.param(); new_param.format = is_trans.conv_format; mgb_assert(conv_bias_src->shape().ndim == 5 && conv_bias_filter->shape().ndim >= 6, "The conv_bias src dim is not trans to nchwxx"); - auto new_conv_bias_opr = opr::ConvBias::make( - conv_bias_src, conv_bias_filter, conv_bias_bias, new_param, - conv_bias_opr.execution_policy(), conv_bias_opr.config()); + SymbolVar new_conv_bias_opr; + if (conv_bias_bias) { + new_conv_bias_opr = opr::ConvBias::make( + conv_bias_src, conv_bias_filter, conv_bias_bias, + new_param, conv_bias_opr.execution_policy(), + conv_bias_opr.config()); + } else { + new_conv_bias_opr = opr::ConvBias::make( + conv_bias_src, conv_bias_filter, new_param, + conv_bias_opr.execution_policy(), + conv_bias_opr.config()); + } OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr(); mgb_assert(new_conv_bias_opr.shape().ndim == 5, "The conv_bias dst dim is not trans to nchwxx"); @@ -2507,25 +2584,37 @@ EnableNchw44DotPass::make_nchw44_dot_converter() { } else { mgb_assert(is_trans.trans_type == TransType::TRANS_HYBIRD_NCHWXX); VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1], - *conv_bias_bias = new_inp[2]; + *conv_bias_bias = nullptr; auto new_filter = RelayoutPlaceholder::make(new_inp[1], is_trans.relayout_mod); conv_bias_filter = new_filter.node(); //! 
bias trans to nchwxx mode, bias may be scale - if (new_inp[2]->shape().ndim == 4) { - auto new_bias = RelayoutPlaceholder::make( - new_inp[2], RelayoutMode::NCHW_TO_NCHW4); - conv_bias_bias = new_bias.node(); + if (new_inp.size() > 2) { + if (new_inp[2]->shape().ndim == 4) { + auto new_bias = RelayoutPlaceholder::make( + new_inp[2], RelayoutMode::NCHW_TO_NCHW4); + conv_bias_bias = new_bias.node(); + } else { + mgb_assert(new_inp[2]->shape().ndim == 5); + conv_bias_bias = new_inp[2]; + } } mgb_assert(conv_bias_src->shape().ndim == 4 && conv_bias_filter->shape().ndim == 5); - mgb_assert((conv_bias_bias->shape().ndim == 5) || - conv_bias_bias->shape().is_scalar()); auto new_param = conv_bias_opr.param(); new_param.format = is_trans.conv_format; - auto new_conv_bias_opr = opr::ConvBias::make( - conv_bias_src, conv_bias_filter, conv_bias_bias, new_param, - conv_bias_opr.execution_policy(), conv_bias_opr.config()); + SymbolVar new_conv_bias_opr; + if (conv_bias_bias) { + new_conv_bias_opr = opr::ConvBias::make( + conv_bias_src, conv_bias_filter, conv_bias_bias, + new_param, conv_bias_opr.execution_policy(), + conv_bias_opr.config()); + } else { + new_conv_bias_opr = opr::ConvBias::make( + conv_bias_src, conv_bias_filter, new_param, + conv_bias_opr.execution_policy(), + conv_bias_opr.config()); + } OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr(); mgb_assert(new_conv_bias_opr.shape().ndim == 5, "The conv dst dim is not trans to nchwxx"); @@ -3127,4 +3216,4 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const { MIDOUT_E } -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} \ No newline at end of file +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index 44989430..cb2cd183 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -710,6 +710,33 @@ TEST(TestGoptInference, Float16IOFloat32Compute) { MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); } +TEST(TestGoptInference, Float16IOFloat32ComputeDeConv) { + constexpr size_t INP_H = 10, INP_W = 10; + HostTensorGenerator<> gen; + auto graph = ComputingGraph::make(); + auto mkvar = [&](const char* name, const TensorShape& shp) { + return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name); + }; + graph->options().graph_opt_level = 0; + + auto s0 = mkvar("s0", {5, 5, 3, 3}), + s1 = mkvar("s1", {1, 5, INP_H, INP_W}); + auto y = opr::ConvolutionBackwardData::make(s0, s1, {}, {}); + SymbolVar y_opt; + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_f32_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + ASSERT_EQ(find_opr(y_opt).param().compute_mode, + opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32); + ASSERT_EQ(y_opt.dtype(), dtype::Float32()); + + HostTensorND host_y, host_y_opt; + auto func = graph->compile({make_callback_copy(y, host_y), + make_callback_copy(y_opt, host_y_opt)}); + func->execute(); + MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2); +} + TEST(TestGoptInference, Float16IOFloat32ComputeWarpPerspective) { constexpr size_t INP_H = 10, INP_W = 10, N = 2; HostTensorGenerator<> gen; @@ -2816,7 +2843,7 @@ TEST(TestGoptInference, ConvertFormatNCHW4) { unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); } - ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4, + ASSERT_EQ(opr::ConvBias::Param::Format::NCHW, find_opr(y_opt).param().format); graph->compile({{y_opt, {}}}) @@ -3009,9 +3036,8 @@ TEST(TestGoptInference, 
ConvertFormatNCHW44) { //! no supported hybrid nchw44 opr::ConvBias::Param param_conv_bias_pad0; param_conv_bias_pad0.pad_h = param_conv_bias_pad0.pad_w = 0; - auto b1 = mkcvar("b1", {1, 8, 1, 1}); auto w1_f1 = mkcvar("w1_1", {8, 3, 1, 1}); - auto conv1_f1 = opr::ConvBias::make(x, w1_f1, b1, param_conv_bias_pad0, {}, + auto conv1_f1 = opr::ConvBias::make(x, w1_f1, param_conv_bias_pad0, {}, OperatorNodeConfig("conv1_f1")); auto conv1_add = conv1_f1 * conv1; @@ -3263,9 +3289,8 @@ TEST(TestGoptInference, ConvertFormatNCHW44_DOT) { opr::ConvBias::Param param_conv_bias; param_conv_bias.pad_h = param_conv_bias.pad_w = 1; auto w1_2 = mkcvar_dtype("w1_2", {8, 8, 3, 3}, dtype::QuantizedS8(2.5f)); - auto b1_2 = mkcvar_dtype("b1_2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f)); auto conv_1_2 = opr::ConvBias::make( - conv_1_q8, w1_2, b1_2, param_conv_bias, {}, + conv_1_q8, w1_2, param_conv_bias, {}, OperatorNodeConfig{"conv_1_2", cn, dtype::QuantizedS8{6.25f}}); auto conv_1_2_fp32 = opr::TypeCvt::make(conv_1_2, dtype::Float32()); diff --git a/src/jit/impl/fusion_pass.cpp b/src/jit/impl/fusion_pass.cpp index 8f728470..00ea7196 100644 --- a/src/jit/impl/fusion_pass.cpp +++ b/src/jit/impl/fusion_pass.cpp @@ -297,7 +297,7 @@ void JITFusionPass::Impl::process_opr(OperatorNodeBase* opr) { #if MGB_JIT_MLIR //! FIXME mlir does't support broadcast currently. auto backend = MGB_GETENV("MGB_JIT_BACKEND"); - if (!strcmp(backend, "MLIR")) { + if (backend && !strcmp(backend, "MLIR")) { for (VarNode* var : opr->input()) { if (!SymbolVar{var}.as_immutable_scalar().valid()) { if (opr->node_prop().dep_map().at(var) & diff --git a/src/jit/impl/mlir/ir/create_gpu_kernel_outlining_pass.cpp b/src/jit/impl/mlir/ir/create_gpu_kernel_outlining_pass.cpp index d7c89e42..ea35f00c 100644 --- a/src/jit/impl/mlir/ir/create_gpu_kernel_outlining_pass.cpp +++ b/src/jit/impl/mlir/ir/create_gpu_kernel_outlining_pass.cpp @@ -44,6 +44,7 @@ using namespace mlir; +namespace { template static void createForAllDimensions(OpBuilder& builder, Location loc, SmallVectorImpl& values) { @@ -80,7 +81,7 @@ static bool isSinkingBeneficiary(Operation* op) { return isa(op); } -LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) { +LogicalResult sink_operations_into_launch_op(gpu::LaunchOp launchOp) { Region& launchOpBody = launchOp.body(); // Identify uses from values defined outside of the scope of the launch @@ -232,7 +233,6 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, launchOp.erase(); } -namespace { /// Pass that moves the kernel of each LaunchOp into its separate nested module. 
/// /// This pass moves the kernel code of each LaunchOp into a function created @@ -258,7 +258,7 @@ public: .str(); // Pull in instructions that can be sunk - if (failed(sinkOperationsIntoLaunchOp(op))) + if (failed(sink_operations_into_launch_op(op))) return WalkResult::interrupt(); gpu::GPUFuncOp outlinedFunc = outlineKernelFuncImpl(op, kernelFnName, operands); @@ -327,7 +327,6 @@ private: return kernelModule; } }; - } // namespace std::unique_ptr mgb::jit::create_gpu_kernel_outlining_pass() { diff --git a/src/jit/impl/mlir/ir/lower_to_affine_pass.cpp b/src/jit/impl/mlir/ir/lower_to_affine_pass.cpp index 04e4f43f..a63a556e 100644 --- a/src/jit/impl/mlir/ir/lower_to_affine_pass.cpp +++ b/src/jit/impl/mlir/ir/lower_to_affine_pass.cpp @@ -20,13 +20,12 @@ #include "./each_mode.h" +#include #include #include #include #include "mlir/IR/StandardTypes.h" -#include - using namespace mgb; using namespace jit; @@ -188,6 +187,7 @@ struct ReturnOpLowering : public OpRewritePattern { LogicalResult matchAndRewrite(jit::ReturnOp op, PatternRewriter& rewriter) const final { + // We lower "mgb.return" directly to "std.return". rewriter.replaceOpWithNewOp(op); return success(); } @@ -212,6 +212,7 @@ public: void runOnFunction() override final { ConversionTarget target(getContext()); target.addLegalDialect(); + // target.addLegalDialect(); target.addIllegalDialect(); OwningRewritePatternList patterns; @@ -236,6 +237,16 @@ std::unique_ptr mgb::jit::create_lower_to_affine_pass() { return std::make_unique(); } +namespace mgb { +namespace jit { +void register_test_mgb_to_affine_lowering_pass() { + PassRegistration( + "mgb-convert-to-affine", + "Perform conversion from MGB Dialect to Affine Dialect ", + [] { return std::make_unique(); }); +} +} // namespace jit +} // namespace mgb #endif // MGB_JIT && MGB_JIT_MLIR // vim: syntax=cpp.doxygen diff --git a/src/jit/impl/mlir/ir/lower_to_llvm_pass.cpp b/src/jit/impl/mlir/ir/lower_to_llvm_pass.cpp index 6415ab43..9e17e0c1 100644 --- a/src/jit/impl/mlir/ir/lower_to_llvm_pass.cpp +++ b/src/jit/impl/mlir/ir/lower_to_llvm_pass.cpp @@ -53,6 +53,16 @@ std::unique_ptr mgb::jit::create_lower_to_llvm_pass() { return std::make_unique(); } +namespace mgb { +namespace jit { +void register_test_affine_to_llvm_lowering_pass() { + PassRegistration( + "mgb-codegen-convert-affine-to-llvm", + "Perform final conversion from Affine to LLVMIR ", + [] { return std::make_unique(); }); +} +} // namespace jit +} // namespace mgb #endif // MGB_JIT && MGB_JIT_MLIR // vim: syntax=cpp.doxygen diff --git a/src/jit/impl/mlir/ir/ops.td b/src/jit/impl/mlir/ir/ops.td index 6e8079fa..f18a5ad0 100644 --- a/src/jit/impl/mlir/ir/ops.td +++ b/src/jit/impl/mlir/ir/ops.td @@ -177,6 +177,12 @@ def ReturnOp : GenericOp<"return", The operation takes an no tensor operand and produces no results. }]; + // The return operation takes an optional input operand to return. This + // value must match the return type of the enclosing function. + let arguments = (ins); + + // The return operation only emits the input in the format if it is present. 
+ let assemblyFormat = "attr-dict"; } def ConstantScalarOp: GenericOp<"sconst", [NoSideEffect]> { diff --git a/src/jit/impl/mlir/ir/types.h b/src/jit/impl/mlir/ir/types.h index 9390c30f..548b5db4 100644 --- a/src/jit/impl/mlir/ir/types.h +++ b/src/jit/impl/mlir/ir/types.h @@ -19,7 +19,7 @@ namespace mgb { namespace jit { -inline const bool is_elemwise_float(const mlir::Type& dt) { +inline bool is_elemwise_float(const mlir::Type& dt) { if (auto cast = dt.dyn_cast_or_null()) { if (cast.getElementType().getKind() == mlir::StandardTypes::F32) { return true; diff --git a/src/jit/test/mlir/CMakeLists.txt b/src/jit/test/mlir/CMakeLists.txt new file mode 100644 index 00000000..aad1717d --- /dev/null +++ b/src/jit/test/mlir/CMakeLists.txt @@ -0,0 +1,27 @@ +configure_lit_site_cfg( + ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.site.cfg.py.in + ${CMAKE_CURRENT_BINARY_DIR}/utils/lit.site.cfg.py + MAIN_CONFIG + ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.cfg.py +) + +set(LLVM_EXTERNAL_LIT "${PROJECT_SOURCE_DIR}/third_party/llvm-project/llvm/utils/lit/lit.py" CACHE STRING "External lit") + +set(MLIR_MGB_TEST_DEPENDS + mgb-file-check + count not + mgb-opt +) + +add_lit_testsuite(mgb-mlir-test-lit "Running the mgb regression tests" + ${CMAKE_CURRENT_BINARY_DIR}/utils + DEPENDS ${MLIR_MGB_TEST_DEPENDS} + ) +set_target_properties(mgb-mlir-test-lit PROPERTIES FOLDER "Tests") + +add_lit_testsuites(MLIR_TEST ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS ${MLIR_MGB_TEST_DEPENDS} +) + +add_custom_target(mlir_pass_check) +add_dependencies(mlir_pass_check mgb-mlir-test-lit) diff --git a/src/jit/test/mlir/ir/BUILD b/src/jit/test/mlir/ir/BUILD new file mode 100644 index 00000000..9c9f6c66 --- /dev/null +++ b/src/jit/test/mlir/ir/BUILD @@ -0,0 +1,16 @@ +load("//brain/megbrain/src/jit/test/mlir/utils:lit.bzl", "mlir_lit_test_suite") + +filegroup( + name = "mlir_test_tools", + testonly = True, + data = [ + "//brain/megbrain/tools/mlir:mgb-opt", + "//brain/megbrain/tools/mlir:mgb-file-check" + ], +) + +mlir_lit_test_suite( + name = "mlir_pass_check", + data = [":mlir_test_tools"], + test_file_exts = ["mlir",] +) diff --git a/src/jit/test/mlir/ir/add.mlir b/src/jit/test/mlir/ir/add.mlir new file mode 100644 index 00000000..6966b083 --- /dev/null +++ b/src/jit/test/mlir/ir/add.mlir @@ -0,0 +1,58 @@ +// RUN: mgb-opt --mgb-convert-to-affine --split-input-file -canonicalize -cse %s | mgb-file-check %s +// RUN: mgb-opt --mgb-convert-to-affine --mgb-codegen-convert-affine-to-llvm --split-input-file -canonicalize -cse %s + +func @add_dim1(%lhs: memref<2xf32>, %rhs: memref<2xf32>, %res: memref<2xf32>) -> () { + %0 = "mgb.add"(%lhs, %rhs) {name = "add.f"} : + (memref<2xf32>, memref<2xf32>) -> memref<2xf32> + "mgb.assign"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () + mgb.return +} +// CHECK-LABEL: func @add_dim1(%arg0: memref<2xf32>, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { +// CHECK: %0 = alloc() : memref<2xf32> +// CHECK: affine.for %arg3 = 0 to 2 { +// CHECK: %1 = affine.load %arg0[%arg3] : memref<2xf32> +// CHECK: %2 = affine.load %arg1[%arg3] : memref<2xf32> +// CHECK: %3 = addf %1, %2 : f32 +// CHECK: affine.store %3, %0[%arg3] : memref<2xf32> +// CHECK: } +// CHECK: affine.for %arg3 = 0 to 2 { +// CHECK: %1 = affine.load %0[%arg3] : memref<2xf32> +// CHECK: affine.store %1, %arg2[%arg3] : memref<2xf32> +// CHECK: } +// CHECK: dealloc %0 : memref<2xf32> +// CHECK: return +// CHECK: } + +func @add_dim4(%lhs: memref<4x3x64x64xf32>, %rhs: memref<4x3x64x64xf32>, %res: memref<4x3x64x64xf32>) -> () { + %0 = "mgb.add"(%lhs, %rhs) {name = 
"add.f"} : + (memref<4x3x64x64xf32>, memref<4x3x64x64xf32>) -> memref<4x3x64x64xf32> + "mgb.assign"(%0, %res) : (memref<4x3x64x64xf32>, memref<4x3x64x64xf32>) -> () + mgb.return +} +// CHECK-LABEL: func @add_dim4(%arg0: memref<4x3x64x64xf32>, %arg1: memref<4x3x64x64xf32>, %arg2: memref<4x3x64x64xf32>) { +// CHECK: %0 = alloc() : memref<4x3x64x64xf32> +// CHECK: affine.for %arg3 = 0 to 4 { +// CHECK: affine.for %arg4 = 0 to 3 { +// CHECK: affine.for %arg5 = 0 to 64 { +// CHECK: affine.for %arg6 = 0 to 64 { +// CHECK: %1 = affine.load %arg0[%arg3, %arg4, %arg5, %arg6] : memref<4x3x64x64xf32> +// CHECK: %2 = affine.load %arg1[%arg3, %arg4, %arg5, %arg6] : memref<4x3x64x64xf32> +// CHECK: %3 = addf %1, %2 : f32 +// CHECK: affine.store %3, %0[%arg3, %arg4, %arg5, %arg6] : memref<4x3x64x64xf32> +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: affine.for %arg3 = 0 to 4 { +// CHECK: affine.for %arg4 = 0 to 3 { +// CHECK: affine.for %arg5 = 0 to 64 { +// CHECK: affine.for %arg6 = 0 to 64 { +// CHECK: %1 = affine.load %0[%arg3, %arg4, %arg5, %arg6] : memref<4x3x64x64xf32> +// CHECK: affine.store %1, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<4x3x64x64xf32> +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: dealloc %0 : memref<4x3x64x64xf32> +// CHECK: return +// CHECK: } \ No newline at end of file diff --git a/src/jit/test/mlir/utils/BUILD b/src/jit/test/mlir/utils/BUILD new file mode 100644 index 00000000..ed3a3dc9 --- /dev/null +++ b/src/jit/test/mlir/utils/BUILD @@ -0,0 +1,5 @@ +filegroup( + name = "litfiles", + srcs = glob(["lit.bzl.*py"]), + visibility = ["//visibility:public"], +) \ No newline at end of file diff --git a/src/jit/test/mlir/utils/lit.bzl b/src/jit/test/mlir/utils/lit.bzl new file mode 100644 index 00000000..51ef8392 --- /dev/null +++ b/src/jit/test/mlir/utils/lit.bzl @@ -0,0 +1,127 @@ +# Test definitions for Lit, the LLVM test runner. +# +"""Lit runner globbing test +""" + +# Default values used by the test runner. +_default_test_file_exts = ["mlir", "pbtxt", "td"] +_default_size = "small" +_default_tags = [] + +# These are patterns which we should never match, for tests, subdirectories, or +# test input data files. +_ALWAYS_EXCLUDE = [ + "**/LICENSE.txt", + "**/README.txt", + "**/lit.local.cfg", + # Exclude input files that have spaces in their names, since bazel + # cannot cope with such "targets" in the srcs list. + "**/* *", + "**/* */**", +] + +def _run_lit_test(name, data, size, tags, features): + """Runs lit on all tests it can find in `data` under megbrain/src/jit/test/mlir/ir. + + Note that, due to Bazel's hermetic builds, lit only sees the tests that + are included in the `data` parameter, regardless of what other tests might + exist in the directory searched. + + Args: + name: str, the name of the test, including extension. + data: [str], the data input to the test. + size: str, the size of the test. + tags: [str], tags to attach to the test. + features: [str], list of extra features to enable. 
+ """ + + native.py_test( + name = name, + srcs = ["@llvm-project//llvm:lit"], + tags = tags, + args = [ + "brain/megbrain/src/jit/test/mlir/utils --config-prefix=lit.bzl -v", + ] + features, + data = data + [ + "//brain/megbrain/src/jit/test/mlir/utils:litfiles", + "//brain/megbrain/tools/mlir:mgb-file-check", + "@llvm-project//llvm:count", + "@llvm-project//llvm:not", + ], + size = size, + main = "lit.py", + ) + +def mlir_lit_test_suite( + name, + exclude = [], + test_file_exts = _default_test_file_exts, + default_size = _default_size, + size_override = {}, + data = [], + per_test_extra_data = {}, + default_tags = _default_tags, + tags_override = {}, + features = []): + """Creates all plausible Lit tests (and their inputs) under this directory. + + Args: + name: str, name of the generated test suite. + exclude: [str], paths to exclude (for tests and inputs). + test_file_exts: [str], extensions for files that are tests. + default_size: str, the test size for targets not in "size_override". + size_override: {str: str}, sizes to use for specific tests. + data: [str], additional input data to the test. + per_test_extra_data: {str: [str]}, extra data to attach to a given file. + default_tags: [str], additional tags to attach to the test. + tags_override: {str: str}, tags to add to specific tests. + features: [str], list of extra features to enable. + """ + + # Ignore some patterns by default for tests and input data. + exclude = _ALWAYS_EXCLUDE + exclude + + test_names = [] + tests = native.glob( + ["*." + ext for ext in test_file_exts], + exclude = exclude, + ) + + # Run tests individually such that errors can be attributed to a specific + # failure. + for i in range(len(tests)): + cur_test = tests[i] + + # Instantiate this test with updated parameters. + internal_name = cur_test + lit_test( + name = internal_name, + data = data + per_test_extra_data.pop(cur_test, []), + size = size_override.pop(cur_test, default_size), + tags = ["windows_fail"] + default_tags + tags_override.pop(cur_test, []), + features = features, + ) + test_names.append(internal_name + ".test") + + native.test_suite( + name = name, + tests = test_names, + tags = default_tags, + ) + +def lit_test( + name, + data = [], + size = _default_size, + tags = _default_tags, + features = []): + """Runs test files under lit. + + Args: + name: str, the name of the test. + data: [str], labels that should be provided as data inputs. + size: str, the size of the test. + tags: [str], tags to attach to the test. + features: [str], list of extra features to enable. + """ + _run_lit_test(name + ".test", data + [name], size, tags, features) diff --git a/src/jit/test/mlir/utils/lit.bzl.cfg.py b/src/jit/test/mlir/utils/lit.bzl.cfg.py new file mode 100644 index 00000000..64eb7b5d --- /dev/null +++ b/src/jit/test/mlir/utils/lit.bzl.cfg.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import platform +import re +import subprocess +import tempfile + +import lit.formats +import lit.util + +from lit.llvm import llvm_config +from lit.llvm.subst import ToolSubst +from lit.llvm.subst import FindTool + +# Configuration file for the 'lit' test runner. + +# name: The name of this test suite. 
+config.name = 'MLIR_TEST' + +config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.mlir'] + +# test_source_root: The root path where tests are located. +config.test_source_root = config.mlir_test_dir + +# test_exec_root: The root path where tests should be run. +config.test_exec_root = os.environ['RUNFILES_DIR'] + +llvm_config.use_default_substitutions() + +# Tweak the PATH to include the tools dir. +llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True) + +tool_dirs = config.mlir_mgb_tools_dirs + [config.mlir_tools_dir, config.llvm_tools_dir] +tool_names = [ + 'mgb-opt', + 'mlir-tblgen', + 'mlir-translate', + 'mgb-file-check', +] +tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] +llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/src/jit/test/mlir/utils/lit.bzl.site.cfg.py b/src/jit/test/mlir/utils/lit.bzl.site.cfg.py new file mode 100644 index 00000000..c1499052 --- /dev/null +++ b/src/jit/test/mlir/utils/lit.bzl.site.cfg.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +"""Lit runner site configuration.""" +import os +import lit.llvm + +config.llvm_tools_dir = os.path.join(os.environ['TEST_SRCDIR'], 'llvm-project', 'llvm') +config.mlir_obj_root = os.path.join(os.environ['TEST_SRCDIR']) +config.mlir_tools_dir = os.path.join(os.environ['TEST_SRCDIR'], 'llvm-project', 'mlir') +config.suffixes = ['.td', '.mlir', '.pbtxt'] + +mlir_mgb_tools_dirs = [ + 'brain/megbrain/tools/mlir', +] +config.mlir_mgb_tools_dirs = [ + os.path.join(os.environ['TEST_SRCDIR'], os.environ['TEST_WORKSPACE'], s) + for s in mlir_mgb_tools_dirs +] +test_dir = os.environ['TEST_TARGET'] +test_dir = test_dir.strip('/').rsplit(':', 1)[0] +config.mlir_test_dir = os.path.join( + os.environ['TEST_SRCDIR'], + os.environ['TEST_WORKSPACE'], + test_dir, +) +lit.llvm.initialize(lit_config, config) + +# Let the main config do the real work. 
+lit_config.load_config( + config, + os.path.join( + os.path.join( + os.environ['TEST_SRCDIR'], + os.environ['TEST_WORKSPACE'], + 'brain/megbrain/src/jit/test/mlir/utils/lit.bzl.cfg.py', + ))) diff --git a/src/jit/test/mlir/utils/lit.bzl.site.cfg.py.in b/src/jit/test/mlir/utils/lit.bzl.site.cfg.py.in new file mode 100644 index 00000000..da7b522e --- /dev/null +++ b/src/jit/test/mlir/utils/lit.bzl.site.cfg.py.in @@ -0,0 +1,49 @@ +@LIT_SITE_CFG_IN_HEADER@ + +import sys + +config.host_triple = "@LLVM_HOST_TRIPLE@" +config.target_triple = "@TARGET_TRIPLE@" +config.llvm_src_root = "@LLVM_SOURCE_DIR@" +config.llvm_obj_root = "@LLVM_BINARY_DIR@" +config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" +config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@" +config.llvm_shlib_dir = "@SHLIBDIR@" +config.llvm_shlib_ext = "@SHLIBEXT@" +config.llvm_exe_ext = "@EXEEXT@" +config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" +config.python_executable = "@PYTHON_EXECUTABLE@" +config.gold_executable = "@GOLD_EXECUTABLE@" +config.ld64_executable = "@LD64_EXECUTABLE@" +config.enable_shared = @ENABLE_SHARED@ +config.enable_assertions = @ENABLE_ASSERTIONS@ +config.targets_to_build = "@TARGETS_TO_BUILD@" +config.native_target = "@LLVM_NATIVE_ARCH@" +config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') +config.host_os = "@HOST_OS@" +config.host_cc = "@HOST_CC@" +config.host_cxx = "@HOST_CXX@" +# Note: ldflags can contain double-quoted paths, so must use single quotes here. +config.host_ldflags = '@HOST_LDFLAGS@' +config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' +config.host_arch = "@HOST_ARCH@" +config.mgb_src_root = "@CMAKE_SOURCE_DIR@" +config.mgb_obj_root = "@CMAKE_BINARY_DIR@" + +# Support substitution of the tools_dir with user parameters. This is +# used when we can't determine the tool dir at configuration time. +try: + config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params + config.llvm_shlib_dir = config.llvm_shlib_dir % lit_config.params +except KeyError: + e = sys.exc_info()[1] + key, = e.args + lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) + + +import lit.llvm +lit.llvm.initialize(lit_config, config) + +# Let the main config do the real work. +lit_config.load_config(config, "@CMAKE_SOURCE_DIR@/src/jit/test/mlir/utils/lit.cfg.py") diff --git a/src/jit/test/mlir/utils/lit.cfg.py b/src/jit/test/mlir/utils/lit.cfg.py new file mode 100644 index 00000000..a4a06252 --- /dev/null +++ b/src/jit/test/mlir/utils/lit.cfg.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import platform +import re +import subprocess +import tempfile + +import lit.formats +import lit.util + +from lit.llvm import llvm_config +from lit.llvm.subst import ToolSubst +from lit.llvm.subst import FindTool + +# Configuration file for the 'lit' test runner. + +# name: The name of this test suite. +config.name = 'MLIR_TEST' + +config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.mlir'] + +# test_source_root: The root path where tests are located. 
+config.test_source_root = os.path.join(os.path.dirname(__file__), '../ir') + +# test_exec_root: The root path where tests should be run. +config.test_exec_root = config.test_source_root + +# llvm_config.use_default_substitutions() + +# Tweak the PATH to include the tools dir. +llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True) + +tool_dirs = [ + os.path.join(config.mgb_obj_root, 'tools/mlir'), + os.path.join(config.mgb_obj_root, 'tools/mlir/mgb-opt'), + os.path.join(config.mgb_obj_root, 'tools/mlir/mgb-file-check'), + config.llvm_tools_dir] +tool_names = [ + 'mgb-opt', + 'mlir-tblgen', + 'mlir-translate', + 'mgb-file-check', +] +tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] +llvm_config.add_tool_substitutions(tools, tool_dirs) + +lit.llvm.initialize(lit_config, config) diff --git a/src/jit/test/mlir/utils/lit.site.cfg.py.in b/src/jit/test/mlir/utils/lit.site.cfg.py.in new file mode 100644 index 00000000..664e54d1 --- /dev/null +++ b/src/jit/test/mlir/utils/lit.site.cfg.py.in @@ -0,0 +1,49 @@ +@LIT_SITE_CFG_IN_HEADER@ + +import sys + +config.host_triple = "@LLVM_HOST_TRIPLE@" +config.target_triple = "@TARGET_TRIPLE@" +config.llvm_src_root = "@LLVM_SOURCE_DIR@" +config.llvm_obj_root = "@LLVM_BINARY_DIR@" +config.llvm_tools_dir = "@LLVM_BINARY_DIR@/bin" +config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@" +config.llvm_shlib_dir = "@SHLIBDIR@" +config.llvm_shlib_ext = "@SHLIBEXT@" +config.llvm_exe_ext = "@EXEEXT@" +config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" +config.python_executable = "@PYTHON_EXECUTABLE@" +config.gold_executable = "@GOLD_EXECUTABLE@" +config.ld64_executable = "@LD64_EXECUTABLE@" +config.enable_shared = @ENABLE_SHARED@ +config.enable_assertions = @ENABLE_ASSERTIONS@ +config.targets_to_build = "@TARGETS_TO_BUILD@" +config.native_target = "@LLVM_NATIVE_ARCH@" +config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') +config.host_os = "@HOST_OS@" +config.host_cc = "@HOST_CC@" +config.host_cxx = "@HOST_CXX@" +# Note: ldflags can contain double-quoted paths, so must use single quotes here. +config.host_ldflags = '@HOST_LDFLAGS@' +config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' +config.host_arch = "@HOST_ARCH@" +config.mgb_src_root = "@CMAKE_SOURCE_DIR@" +config.mgb_obj_root = "@CMAKE_BINARY_DIR@" + +# Support substitution of the tools_dir with user parameters. This is +# used when we can't determine the tool dir at configuration time. +try: + config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params + config.llvm_shlib_dir = config.llvm_shlib_dir % lit_config.params +except KeyError: + e = sys.exc_info()[1] + key, = e.args + lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) + + +import lit.llvm +lit.llvm.initialize(lit_config, config) + +# Let the main config do the real work. 
+lit_config.load_config(config, "@CMAKE_SOURCE_DIR@/src/jit/test/mlir/utils/lit.cfg.py") diff --git a/src/megbrain_build_config.h.in b/src/megbrain_build_config.h.in index 30daf3e3..6caf1c08 100644 --- a/src/megbrain_build_config.h.in +++ b/src/megbrain_build_config.h.in @@ -34,8 +34,6 @@ #cmakedefine01 MGB_ENABLE_FBS_SERIALIZATION #cmakedefine01 MGB_IS_DEV -#cmakedefine01 MGB_ENABLE_IMPERATIVE - // DNN related flags // Platform macro's #cmakedefine01 MEGDNN_WITH_CUDA diff --git a/src/opr-mm/impl/collective_comm.cpp b/src/opr-mm/impl/collective_comm.cpp index 46e43872..e3bc3f13 100644 --- a/src/opr-mm/impl/collective_comm.cpp +++ b/src/opr-mm/impl/collective_comm.cpp @@ -630,11 +630,7 @@ void CollectiveComm::get_output_var_shape(const TensorShapeArray& inp_shape, inp_shape, out_shape); } -void CollectiveComm::init_output_comp_node() { - mgb_assert(output().size() == 1, "exactly one output expected, got %zu", output().size()); - owner_graph()->seq_comp_node_optimizer().register_stream_var(output()[0], - {CompNode::Stream::NCCL, cg::SeqCompNodeOptimizer::StreamPropType::WEAK}); -} +void CollectiveComm::init_output_comp_node() {} void CollectiveComm::init_output_mem_plan(bool dynamic) { for (size_t i = 0; i < output().size(); i++) { diff --git a/src/opr/impl/dnn/adaptive_pooling.cpp b/src/opr/impl/dnn/adaptive_pooling.cpp new file mode 100644 index 00000000..93d829d3 --- /dev/null +++ b/src/opr/impl/dnn/adaptive_pooling.cpp @@ -0,0 +1,148 @@ +/** + * \file src/opr/impl/dnn/adaptive_pooling.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "megbrain/opr/dnn/adaptive_pooling.h" +#include "../internal/megdnn_opr_wrapper.inl" +#include "megbrain/graph/grad_impl.h" +#include "megbrain/opr/utility.h" + +#include "megdnn/opr_param_defs.h" +#include "megdnn/oprs/nn.h" + +using namespace mgb; +using namespace opr; + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(AdaptivePoolingForward); +AdaptivePoolingForward::AdaptivePoolingForward(VarNode* src, VarNode* out_shape, + const Param& param, + const OperatorNodeConfig& config) + : Super(OperatorNodeBaseCtorParam{src->owner_graph(), + config, + "adaptive_pooling", + {src, out_shape}}) { + init_megdnn_opr(*this, param); + add_input({src, out_shape}); + outshape_by_symvar_enable(1, 1); +} + +SymbolVar AdaptivePoolingForward::make(SymbolVar src, SymbolVar out_shape, + const Param& param, + const OperatorNodeConfig& config) { + return src.insert_single_output_opr( + src.node(), out_shape.node(), param, config); +} + +void AdaptivePoolingForward::scn_do_execute() { + megdnn_opr()->exec(input(0)->dev_tensor().as_megdnn(), + output(0)->dev_tensor().as_megdnn(), + intl::get_megdnn_workspace_from_var(output().back())); +} + +void AdaptivePoolingForward::outshape_by_symvar_do_get_output_shape( + TensorShape& dest, const ShapeInferInfo& shpinfo) { + TensorShape oshp2d; + cg::copy_tensor_value_to_shape(oshp2d, *shpinfo.shpval_inp_val.at(0)); + auto src = shpinfo.shape_inp_shp.at(0); + mgb_assert(src.ndim == 4 && oshp2d.ndim == 2, + "shape mismatch for AdaptivePooling: src=%s, out2d=%s", + src.to_string().c_str(), oshp2d.to_string().c_str()); + + mgb_assert(param().format == Param::Format::NCHW, + "AdaptivePooling only support NCHW"); + dest.ndim = 4; + dest.shape[0] = src.shape[0]; + dest.shape[1] = src.shape[1]; + dest.shape[2] = oshp2d.shape[0]; + dest.shape[3] = oshp2d.shape[1]; +} + +size_t AdaptivePoolingForward::get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const { + return megdnn_opr()->get_workspace_in_bytes( + {input_shapes[0], this->input(0)->dtype(), + this->input(0)->format()}, + {output_shapes[0], this->output(0)->dtype(), + this->output(0)->format()}); +} + +void AdaptivePoolingForward::init_output_dtype() { + output(0)->dtype(input(0)->dtype()); +} + +void AdaptivePoolingForward::add_input_layout_constraint() { + mixin::megdnn_utils::add_input_layout_constraint_contig(*this); +} + +void AdaptivePoolingForward::init_output_static_infer_desc() { + Super::init_output_static_infer_desc(); + init_output_static_infer_desc_workspace(false); +} + +void AdaptivePoolingForward::record_execute_deps(ExecDependencyArray& deps) { + record_megdnn_opr(deps); +} + +#ifdef MGB_ENABLE_GRAD +MGB_IMPL_OPR_GRAD(AdaptivePoolingForward) { + if (wrt_idx == 0) { + // wrt src + SymbolVar grad = AdaptivePoolingBackward::make( + opr.input(0), opr.input(1), opr.output(0), out_grad[0], + opr.param()); + return grad.node(); + } else { + mgb_assert(wrt_idx == 1); + return InvalidGrad::make(opr, wrt_idx); + } +} +#endif + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(AdaptivePoolingBackward); +AdaptivePoolingBackward::AdaptivePoolingBackward( + VarNode* src, VarNode* out_shape, VarNode* dst, VarNode* diff, + const Param& param, const OperatorNodeConfig& config) + : Super(OperatorNodeBaseCtorParam{src->owner_graph(), + config, + "adaptive_pooling_bwd", + {src}}, + 0, true) { + init_megdnn_opr(*this, param); + add_input({src, out_shape, dst, diff}); +} + +SymbolVar AdaptivePoolingBackward::make(SymbolVar src, SymbolVar out_shape, + SymbolVar dst, SymbolVar diff, + const Param& 
+                                        const OperatorNodeConfig& config) {
+    return src.insert_single_output_opr<AdaptivePoolingBackward>(
+            src.node(), out_shape.node(), dst.node(), diff.node(), param,
+            config);
+}
+
+void AdaptivePoolingBackward::scn_do_execute() {
+    megdnn_opr()->exec(input(0)->dev_tensor().as_megdnn(),
+                       input(2)->dev_tensor().as_megdnn(),
+                       input(3)->dev_tensor().as_megdnn(),
+                       output(0)->dev_tensor().as_megdnn(),
+                       intl::get_megdnn_workspace_from_var(output().back()));
+}
+size_t AdaptivePoolingBackward::get_workspace_size_bytes(
+        const TensorShapeArray& input_shapes,
+        const TensorShapeArray& output_shapes) const {
+    return megdnn_opr()->get_workspace_in_bytes(
+            {input_shapes[0], input(0)->dtype(), input(0)->format()},
+            {input_shapes[2], input(2)->dtype(), input(2)->format()},
+            {input_shapes[3], input(3)->dtype(), input(3)->format()},
+            {output_shapes[0], output(0)->dtype(), output(0)->format()});
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
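For orientation, a minimal usage sketch of the operator implemented above (variable names hypothetical; the TensorShape overload of make comes from the header added later in this patch):

    // pool an NCHW SymbolVar x down to a fixed 7x7 spatial output
    opr::AdaptivePooling::Param param;
    param.mode = opr::AdaptivePooling::Param::Mode::MAX;
    SymbolVar y = opr::AdaptivePooling::make(x, TensorShape{7, 7}, param);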
diff --git a/src/opr/impl/dnn/batch_norm.cpp b/src/opr/impl/dnn/batch_norm.cpp
index 6461573b..5d403fbf 100644
--- a/src/opr/impl/dnn/batch_norm.cpp
+++ b/src/opr/impl/dnn/batch_norm.cpp
@@ -44,7 +44,7 @@ BatchNormForward::BatchNormForward(VarNode *x,
         m_force_inplace = false;
     }
 
-    if (m_force_inplace) {
+    if (m_force_inplace && param.fwd_mode == Param::FwdMode::TRAINING) {
         auto check_dest = [&](VarNode* dest) {
             auto dest_opr = dest->owner_opr();
             mgb_throw_if(!(dest_opr->same_type<SharedDeviceTensor>() ||
@@ -62,7 +62,14 @@ BatchNormForward::BatchNormForward(VarNode *x,
 
     add_input({x, scale, bias, mean, variance});
 
-    if (m_force_inplace) {
+    if (param.fwd_mode == Param::FwdMode::INFERENCE) {
+        auto mark_empty_var = [&](VarNode *var) {
+            var->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE)
+               .add_flag(VarNode::Flag::VOLATILE_CONTENT);
+        };
+        mark_empty_var(output(0));
+        mark_empty_var(output(1));
+    } else if (m_force_inplace) {
         output(0)->
             set_fwd_in2out_writable_force(input(3)).
             add_flag(VarNode::Flag::NO_MEM_RECLAIM);
@@ -129,7 +136,7 @@ SymbolVarArray BatchNormForward::make(SymbolVar x,
 
 cg::OperatorNodeBase::NodeProp* BatchNormForward::do_make_node_prop() const {
     auto ret = Super::do_make_node_prop();
-    if (input().size() == 5) {
+    if (need_stats()) {
         ret->add_flag(NodeProp::Flag::FORCE_UPDATE_INPUT_VAR);
     }
     return ret;
@@ -140,8 +147,7 @@ void BatchNormForward::scn_do_execute() {
     auto &&y = output(4)->dev_tensor();
     mgb_assert(x.layout().is_contiguous() &&
                y.layout().is_contiguous());
-#if MGB_ENABLE_IMPERATIVE
-    if (input().size() == 5) { // need running mean/variance
+    if (need_stats()) {
         auto &&o0 = output(0)->dev_tensor(),
              &&o1 = output(1)->dev_tensor(),
              &&i0 = input(3)->dev_tensor(),
@@ -163,11 +169,16 @@ void BatchNormForward::scn_do_execute() {
                    && o1.raw_ptr() == i1.raw_ptr());
         }
     }
-#endif
     auto scale = input(1)->dev_tensor().as_megdnn();
     auto bias = input(2)->dev_tensor().as_megdnn();
-    auto mean = output(0)->dev_tensor().as_megdnn();
-    auto variance = output(1)->dev_tensor().as_megdnn();
+    megdnn::TensorND mean, variance;
+    if (param().fwd_mode == Param::FwdMode::INFERENCE) {
+        mean = input(3)->dev_tensor().as_megdnn();
+        variance = input(4)->dev_tensor().as_megdnn();
+    } else {
+        mean = output(0)->dev_tensor().as_megdnn();
+        variance = output(1)->dev_tensor().as_megdnn();
+    }
     auto save_mean = output(2)->dev_tensor().as_megdnn();
     auto save_variance = output(3)->dev_tensor().as_megdnn();
     auto workspace = intl::get_megdnn_workspace_from_var(output().back());
@@ -182,12 +193,11 @@ void BatchNormForward::add_input_layout_constraint() {
 
 void BatchNormForward::get_output_var_shape(
         const TensorShapeArray &inp_shape, TensorShapeArray &out_shape) const {
-    size_t nr_inp = input().size();
     out_shape[4] = inp_shape[0];
     for (size_t i = 0; i < 4; ++ i) {
         out_shape[i] = inp_shape[1];
     }
-    if (nr_inp == 3) {
+    if (!need_stats()) {
         out_shape[0] = out_shape[1] = {0};
     }
 }
@@ -223,7 +233,7 @@ void BatchNormForward::init_output_dtype() {
 }
 
 void BatchNormForward::mem_plan_fwd_in2out_writable() {
-    if (!m_force_inplace && input().size() == 5) {
+    if (need_stats() && !m_force_inplace) {
         // TODO: testing
         output(0)->set_fwd_in2out_writable(input(3));
         output(1)->set_fwd_in2out_writable(input(4));
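A hedged sketch of the new inference path from the caller's side (names hypothetical; per scn_do_execute above, output index 4 is the normalized result, and outputs 0/1 stay empty because the running statistics are consumed as read-only inputs):

    opr::BatchNorm::Param param;
    param.fwd_mode = opr::BatchNorm::Param::FwdMode::INFERENCE;
    auto outs = opr::BatchNorm::make(x, scale, bias, mean, variance, param);
    SymbolVar y = outs[4];  // outs[0]/outs[1] are empty placeholders here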
diff --git a/src/opr/impl/dnn/convolution.cpp b/src/opr/impl/dnn/convolution.cpp
index eda63b4a..f4a2abfb 100644
--- a/src/opr/impl/dnn/convolution.cpp
+++ b/src/opr/impl/dnn/convolution.cpp
@@ -963,6 +963,9 @@ void mixin::WeightPreprocessExecutor::record_preprocessed_weight(
 
 bool mixin::WeightPreprocessExecutor::mixin_allow_weight_preprocess(
         const cg::OperatorNodeBase& opr) const {
+    if (!opr.owner_graph()->options().graph_opt.weight_preprocess) {
+        return false;
+    }
     if (!opr.input(1)->contain_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE))
         return false;
     if (cg::is_const_var_value(opr.input(1)))
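With the early-return above, weight preprocessing is now gated by a per-graph option; a minimal opt-in sketch (assuming the option defaults to off, as the tests later in this patch suggest):

    auto graph = ComputingGraph::make();
    graph->options().graph_opt.weight_preprocess = true;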
diff --git a/src/opr/impl/dnn/dnn.oprdecl b/src/opr/impl/dnn/dnn.oprdecl
index cadc8da0..224d251b 100644
--- a/src/opr/impl/dnn/dnn.oprdecl
+++ b/src/opr/impl/dnn/dnn.oprdecl
@@ -95,6 +95,7 @@ r"""
          """))
 
 decl_opr('Local',
+         pyname='local',
          inputs=[Doc('src',
                      'input image in (batch, channel, row, col) format'),
                  Doc('filter',
@@ -105,6 +106,19 @@ decl_opr('Local',
          desc='batched convolution on channeled 2D images, but kernels are '
          'not shared across different output positions')
 
+decl_opr('Local',
+         pyname='local_v1',
+         inputs=[Doc('src',
+                     'input image in (batch, channel, row, col) format'),
+                 Doc('filter',
+                     'convolution kernel in '
+                     '(out row, out col, in channel, '
+                     'kern row, kern col, out channel) format')],
+         params='Convolution',
+         desc='batched convolution on channeled 2D images, but kernels are '
+         'not shared across different output positions',
+         version=1)
+
 decl_opr('GroupLocal',
          inputs=[Doc('src',
                      'input image in (batch, channel, row, col) format'),
@@ -113,7 +127,7 @@ decl_opr('GroupLocal',
                      '(group, out row, out col, in channel / group, '
                      'kern row, kern col, out channel / group) format')],
          params=[('param', 'Convolution')],
-         desc='batched convolution on groupped channeled 2D images, but '
+         desc='batched convolution on grouped channeled 2D images, but '
          'kernels are not shared across different output positions',
          version=1)
 
@@ -126,6 +140,13 @@ decl_opr('Pooling',
          inputs=['src'],
          params='Pooling')
 
+decl_opr('AdaptivePooling',
+         inputs=[Doc('src', 'input image, shape (n, c, ih, iw)'),
+                 Doc('out_shape', 'output image shape, containing two elements specifying output height and width.')],
+         params='AdaptivePooling',
+         desc='Adaptive Pooling. '
+         'The output shape is (n, c, oh, ow), where (oh, ow) is given by *out_shape*.')
+
 decl_opr('ROIPooling', outputs=[0],
          inputs=[Doc('src', 'input image, shape (n, c, ih, iw)'),
                  Doc('rois', 'regions of interest, shape (m, 5). '
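The decl above only fixes the interface; the lowering to an ordinary pooling lives in megdnn's deduce_pooling_param, whose body this diff does not show. The sketch below is the conventional adaptive-pooling arithmetic and should be read as an assumption, not a quote of that function:

    // zero padding; stride floors the ratio; the window absorbs the remainder,
    // so the last window ends exactly at the input border
    size_t stride_h = ih / oh, stride_w = iw / ow;
    size_t window_h = ih - (oh - 1) * stride_h;
    size_t window_w = iw - (ow - 1) * stride_w;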
@@ -244,7 +265,7 @@ decl_opr('ROIAlign', outputs=[0],
                      'store it as a float, but it should be an integral value.'
                      ' The rois[:, 1:5] are (x0, y0, x1, y1) for each ROI, '
                      'which would be multiplied by the scale value given in '
-                     'param.')],
+                     'param.')],
          params='ROIAlign',
          desc='ROI Align, see '
          'Mask-RCNN: https://arxiv.org/pdf/1703.06870.pdf, '
@@ -281,7 +302,7 @@ decl_opr('BatchConvBiasForward',
                  ('execution_policy', 'ExecutionPolicy')],
          desc=Doc(None,
 r"""
-    Apply a convolution of input tensor and filter tensor whose weights are not shared in batch dimensions. Outputs with batch index use the same weight.
+    Apply a convolution of input tensor and filter tensor whose weights are not shared in batch dimensions. Outputs with batch index use the same weight.
     Assume input shape is :math:`(N, IC, IH, IW)` and filter shape is :math:`(batch, OC, IC, FH, FW)`, the output shape will be :math:`(N, OC, OH, OW)` where :math:`(OH, OW)` would be computed from padding, stride, :math:`(FH, FW)` and :math:`(IH, IW)`, as in convolution. For each output location, we have:
diff --git a/src/opr/impl/dnn/dnn.sereg.h b/src/opr/impl/dnn/dnn.sereg.h
index 63f6d948..345bfd05 100644
--- a/src/opr/impl/dnn/dnn.sereg.h
+++ b/src/opr/impl/dnn/dnn.sereg.h
@@ -13,6 +13,7 @@
 #include "megbrain/opr/dnn/convolution.h"
 #include "megbrain/opr/dnn/images2neibs.h"
 #include "megbrain/opr/dnn/pooling.h"
+#include "megbrain/opr/dnn/adaptive_pooling.h"
 #include "megbrain/opr/dnn/roi_pooling.h"
 #include "megbrain/opr/dnn/roi_align.h"
 #include "megbrain/opr/dnn/local.h"
@@ -388,6 +389,9 @@ namespace opr {
     MGB_SEREG_OPR(Pooling, 1);
     MGB_SEREG_OPR(PoolingBackward, 3);
 
+    MGB_SEREG_OPR(AdaptivePooling, 2);
+    MGB_SEREG_OPR(AdaptivePoolingBackward, 4);
+
     MGB_SEREG_OPR(ROIPooling, 3);
     MGB_SEREG_OPR(ROIPoolingBackward, 4);
 
diff --git a/src/opr/impl/misc.cpp b/src/opr/impl/misc.cpp
index 3f738d4e..d4c1bb0b 100644
--- a/src/opr/impl/misc.cpp
+++ b/src/opr/impl/misc.cpp
@@ -159,6 +159,7 @@ void Cumsum::init_output_static_infer_desc() {
             {SourceType::DEP, {{input(0), DepType::SHAPE}}, infer_workspace});
 }
 
+
 /* ================= CondTake ================= */
 
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(CondTake);
diff --git a/src/opr/impl/misc.oprdecl b/src/opr/impl/misc.oprdecl
index b2444c65..d76f473d 100644
--- a/src/opr/impl/misc.oprdecl
+++ b/src/opr/impl/misc.oprdecl
@@ -63,4 +63,5 @@ decl_opr('TopK',
          inputs=['data', 'k'],
          params='TopK',
          desc='Select the top k values from sorted result.')
+
 # vim: ft=python
diff --git a/src/opr/impl/misc.sereg.h b/src/opr/impl/misc.sereg.h
index 7c5e7ea6..b8562ee5 100644
--- a/src/opr/impl/misc.sereg.h
+++ b/src/opr/impl/misc.sereg.h
@@ -70,6 +70,7 @@ namespace opr {
     using CumsumV1 = opr::Cumsum;
     MGB_SEREG_OPR(CumsumV1, 1);
 
+
 } // namespace opr
 } // namespace mgb
diff --git a/src/opr/impl/tensor_manip.cpp b/src/opr/impl/tensor_manip.cpp
index 862e44ce..d49bd82c 100644
--- a/src/opr/impl/tensor_manip.cpp
+++ b/src/opr/impl/tensor_manip.cpp
@@ -237,7 +237,8 @@ void GetVarShape::record_execute_deps(ExecDependencyArray& deps) {
 
 void ReshapeBrdcastHelper::reshapebrdcast_init(VarNode *inp, VarNode *tshp) {
     add_input({inp, tshp});
-    add_output(None)->dtype(inp->dtype());
+    add_output(None)->dtype(inp->dtype())
+                    .add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE);
     if (reshapebrdcast_output_shape_need_input_shape())
         outshape_by_symvar_enable(1, 1);
     else
@@ -340,6 +341,14 @@ void ReshapeBrdcastHelper::init_output_static_infer_desc() {
                 infer_value});
 }
 
+ReshapeBrdcastHelper::NodeProp*
+ReshapeBrdcastHelper::do_make_node_prop() const {
+    auto ret = Super::do_make_node_prop();
+    ret->add_dep_type_existing_var(input(0),
+                                   NodeProp::DepType::VALUE_ALLOW_EMPTY);
+    return ret;
+}
+
 // f}}}
 
 /* f{{{ ======================= Reshape ======================= */
@@ -394,7 +403,7 @@ Maybe<TensorLayout> Reshape::reshapebrdcast_get_dest_layout(
     }
     auto tot_nr_elem = src.total_nr_elems();
     actual_tshape.shape[unspec] = 0;
-    mgb_throw_if(tot_nr_elem % rem_nr_elem, TensorReshapeError,
+    mgb_throw_if(!rem_nr_elem || tot_nr_elem % rem_nr_elem, TensorReshapeError,
                  "could not reshape: src=%s tshape=%s unspec_axis=%zd",
                  static_cast<const TensorShape&>(src).to_string().c_str(),
                  actual_tshape.to_string().c_str(),
@@ -484,6 +493,17 @@ void AxisManipOprBase::init_output_static_infer_desc() {
             {SourceType::DEP, {{input(0), DepType::VALUE}}, infer_value});
 }
 
+AxisManipOprBase::NodeProp* AxisManipOprBase::do_make_node_prop() const {
+    auto ret = Super::do_make_node_prop();
+    ret->add_dep_type_existing_var(input(0),
+                                   NodeProp::DepType::VALUE_ALLOW_EMPTY);
+    return ret;
+}
+
+void AxisManipOprBase::axis_manip_init(VarNode* inp) {
+    add_input({inp});
+    add_output(None)->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE);
+}
 
 // f}}}
 
@@ -504,8 +524,7 @@ Dimshuffle::Dimshuffle(VarNode *inp, const std::vector<int> &pattern,
         mgb_throw_if(i < -1 || i >= int(ndim), GraphError,
                      "bad Dimshuffle pattern");
     }
-    add_input({inp});
-    add_output(None);
+    axis_manip_init(inp);
     add_equivalence_component<PODHash<int>>(m_pattern.data(), m_pattern.size());
 }
 
@@ -587,8 +606,7 @@ AxisAddRemove::AxisAddRemove(
 {
     mgb_throw_if(desc.empty(), GraphError,
                  "desc for AxisAddRemove could not be empty");
-    add_input({inp});
-    add_output(None)->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE);
+    axis_manip_init(inp);
     add_equivalence_component<PODHash<AxisDesc>>(m_desc.data(), m_desc.size());
 }
 
@@ -631,13 +649,6 @@ TensorLayout AxisAddRemove::axis_manip_get_output_layout(
     return layout;
 }
 
-AxisAddRemove::NodeProp* AxisAddRemove::do_make_node_prop() const {
-    auto ret = Super::do_make_node_prop();
-    ret->add_dep_type_existing_var(input(0),
-                                   NodeProp::DepType::VALUE_ALLOW_EMPTY);
-    return ret;
-}
-
 #ifdef MGB_ENABLE_GRAD
 MGB_IMPL_OPR_GRAD(AxisAddRemove) {
     MGB_MARK_USED_VAR(wrt_idx);
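A worked case for the strengthened Reshape guard above (variables hypothetical): reshaping an empty tensor with unspec_axis = 0 and tshape {2, 3, 3, 3} gives rem_nr_elem = 3*3*3 = 27 and tot_nr_elem = 0, so 0 % 27 == 0 and the unspecified axis resolves to 0; a zero in any specified slot would instead make rem_nr_elem == 0, which used to hit a modulo-by-zero and now raises TensorReshapeError.

    auto y = opr::Reshape::make(x_empty, TensorShape{2, 3, 3, 3}, {0});  // -> {0, 3, 3, 3}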
+ */
+
+#pragma once
+
+#include "megbrain/opr/internal/megdnn_opr_wrapper.h"
+#include "megbrain/opr/internal/out_shape_by_sym_var.h"
+#include "megdnn/opr_param_defs.h"
+#include "megdnn/oprs/nn.h"
+
+namespace mgb {
+namespace opr {
+
+MGB_DEFINE_OPR_CLASS(
+        AdaptivePoolingForward,
+        intl::WorkspaceSizeInfer<intl::OutshapeBySymvarSCNOpr<
+                mixin::MegDNNOprHolderImpl<megdnn::AdaptivePoolingForward>>>) // {
+public:
+    AdaptivePoolingForward(VarNode* src, VarNode* out_shape,
+                           const Param& param,
+                           const OperatorNodeConfig& config);
+    static SymbolVar make(SymbolVar src, SymbolVar out_shape,
+                          const Param& param,
+                          const OperatorNodeConfig& config = {});
+    static SymbolVar make(SymbolVar src, const TensorShape& out_shape,
+                          const Param& param,
+                          const OperatorNodeConfig& config = {}) {
+        return make(src, cg::var_from_tensor_shape(src, out_shape), param,
+                    config);
+    }
+
+private:
+    void scn_do_execute() override;
+    void outshape_by_symvar_do_get_output_shape(
+            TensorShape& dest, const ShapeInferInfo& shpinfo) override;
+    size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes,
+                                    const TensorShapeArray& output_shapes)
+            const override;
+    void init_output_dtype() override;
+    void add_input_layout_constraint() override;
+    void init_output_static_infer_desc() override;
+    void record_execute_deps(ExecDependencyArray& deps) override;
+};
+using AdaptivePooling = AdaptivePoolingForward;
+
+MGB_DEFINE_OPR_CLASS(
+        AdaptivePoolingBackward,
+        intl::MegDNNOprWrapperBwd<megdnn::AdaptivePoolingBackward>) // {
+public:
+    AdaptivePoolingBackward(VarNode* src, VarNode* out_shape, VarNode* dst,
+                            VarNode* diff, const Param& param,
+                            const OperatorNodeConfig& config);
+    static SymbolVar make(SymbolVar src, SymbolVar out_shape, SymbolVar dst,
+                          SymbolVar diff, const Param& param,
+                          const OperatorNodeConfig& config = {});
+
+private:
+    void scn_do_execute() override;
+    size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes,
+                                    const TensorShapeArray& output_shapes)
+            const override;
+};
+
+} // namespace opr
+} // namespace mgb
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/include/megbrain/opr/dnn/batch_norm.h b/src/opr/include/megbrain/opr/dnn/batch_norm.h
index 558f9132..c3ca2f24 100644
--- a/src/opr/include/megbrain/opr/dnn/batch_norm.h
+++ b/src/opr/include/megbrain/opr/dnn/batch_norm.h
@@ -79,6 +79,8 @@ MGB_DEFINE_OPR_CLASS(BatchNormForward,
 
     // if set to True, running mean/variance will be updated inplace
     bool m_force_inplace = true;
+
+    // need running mean/variance
+    bool need_stats() const { return input().size() == 5 && param().fwd_mode == Param::FwdMode::TRAINING; }
 };
 using BatchNorm = BatchNormForward;
diff --git a/src/opr/include/megbrain/opr/misc.h b/src/opr/include/megbrain/opr/misc.h
index e6285a41..314adefc 100644
--- a/src/opr/include/megbrain/opr/misc.h
+++ b/src/opr/include/megbrain/opr/misc.h
@@ -94,6 +94,7 @@ MGB_DEFINE_OPR_CLASS(Cumsum, cg::SingleCNOperatorNodeBaseT<
     void init_output_static_infer_desc() override;
 };
 
+
 namespace intl {
 using CondTakeBase = cg::SingleCNOperatorNode
 host_val = gen({SIZE});
-    auto cn_nccl = CompNode::load("gpu0").change_stream(CompNode::Stream::NCCL);
+    auto cn1 = CompNode::load("gpu0:0").change_stream(1);
     auto param = opr::SharedDeviceTensor::make(*graph, *host_val);
     param.node()->owner_opr()->node_prop().attribute().priority =
             std::numeric_limits<int>::max();
-    auto copy = opr::Copy::make(param, cn_nccl);
+    auto copy = opr::Copy::make(param, cn1);
     auto add = (copy + 3) * 5;
-    auto add_update = opr::AddUpdate::make(param, add, {}, {cn_nccl});
+    auto add_update = opr::AddUpdate::make(param, add, {}, {cn1});
     auto callback = opr::CallbackInjector::make(add_update, set_flag);
diff --git a/src/opr/test/dnn/adaptive_pooling.cpp b/src/opr/test/dnn/adaptive_pooling.cpp
new file mode 100644
index 00000000..cf08f889
--- /dev/null
+++ b/src/opr/test/dnn/adaptive_pooling.cpp
@@ -0,0 +1,87 @@
+/**
+ * \file src/opr/test/dnn/adaptive_pooling.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#include "megbrain/opr/dnn/adaptive_pooling.h"
+#include "megbrain/comp_node_env.h"
+#include "megbrain/opr/dnn/pooling.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/test/autocheck.h"
+#include "megbrain/test/megdnn_helper.h"
+#include "megdnn/dtype.h"
+#include "megdnn/opr_param_defs.h"
+
+using namespace std;
+using namespace mgb;
+
+namespace {
+
+using Param = opr::AdaptivePoolingForward::Param;
+void run(Param::Mode mode) {
+    using Checker = AutoOprChecker<2, 1>;
+    Param param{mode};
+
+    auto make_graph =
+            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
+        auto o0 = opr::GetVarShape::make(inputs[1]);
+        auto o1 = opr::AdaptivePoolingForward::make(inputs[0], o0, param);
+        return {o1};
+    };
+
+    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
+        auto opr = MegDNNHandle::get(
+                           CompNodeEnv::from_comp_node(CompNode::default_cpu()))
+                           ->create_operator<megdnn::AdaptivePoolingForward>();
+        opr->param() = param;
+        size_t N = inp[0].get()->shape(0), C = inp[0].get()->shape(1);
+        size_t OH = inp[1].get()->shape(0), OW = inp[1].get()->shape(1);
+        dest[0].resize(TensorShape{N, C, OH, OW});
+        opr->exec(inp[0]->as_megdnn(), dest[0].as_megdnn(), {});
+    };
+
+    auto gen = [&](HostTensorND& src) {
+        if (mode == Param::Mode::MAX) {
+            HostTensorGenerator<dtype::Float32, RandomDistribution::CONSECUTIVE>
+                    src_gen(1.0f, 0.1f);
+            src = *src_gen(src.shape(), src.comp_node());
+        } else {
+            HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM>
+                    src_gen(10.f);
+            src = *src_gen(src.shape(), src.comp_node());
+        }
+    };
+
+    Checker::RunOptions opt;
+    opt.numdiff_max_err = 1e-2;
+
+    Checker checker{make_graph, fwd};
+    checker.set_input_allow_grad(1, false)
+           .set_input_generator(0, gen);
+    checker.run({TensorShape{1, 1, 10, 7}, TensorShape{5, 4}}, opt);
+    checker.run({TensorShape{1, 1, 9, 7}, TensorShape{5, 4}}, opt);
+    checker.run({TensorShape{1, 2, 8, 9}, TensorShape{3, 4}}, opt);
+}
+
+} // anonymous namespace
+
+TEST(TestOprDNN, AdaptivePoolingMax) {
+    run(Param::Mode::MAX);
+}
+
+TEST(TestOprDNN, AdaptivePoolingAverage) {
+    run(Param::Mode::AVERAGE);
+}
+
+TEST(TestOprDNN, AdaptivePoolingAverageCountExcludePadding) {
+    run(Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING);
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
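Why MAX mode draws from the new CONSECUTIVE distribution: a strictly increasing ramp makes every pooling window's maximum unique, which keeps AutoOprChecker's numeric gradient well-defined (ties would make it unstable). What the generator yields, as a sketch:

    HostTensorGenerator<dtype::Float32, RandomDistribution::CONSECUTIVE> g{1.0f, 0.1f};
    auto t = g({4});  // 1.0, 1.1, 1.2, 1.3 -- pairwise distinct by construction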
diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp
index aa79d029..500efd92 100644
--- a/src/opr/test/dnn/convolution.cpp
+++ b/src/opr/test/dnn/convolution.cpp
@@ -2225,6 +2225,7 @@ protected:
         iw = ih;
         comp_node = CompNode::load("cpux");
         graph = ComputingGraph::make();
+        graph->options().graph_opt.weight_preprocess = is_weight_preprocess();
         TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
         x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
         auto x = opr::Host2DeviceCopy::make(*graph, x_host);
@@ -2247,6 +2248,8 @@ protected:
 
     void run() { func->execute().wait(); }
 
+    virtual bool is_weight_preprocess() { return true; }
+
     void TearDown() override {
         func.reset();
         // Triggers mock check
@@ -2346,6 +2349,33 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
     }
 }
 
+class TestNoWeightPreprocess : public TestWeightPreprocess {
+    bool is_weight_preprocess() override { return false; }
+};
+
+TEST_F(TestNoWeightPreprocess, NoPreprocess) {
+    using ::testing::_;
+    using ::testing::Return;
+    auto& mock = mock_conv();
+
+    MockAlgorithm algo;
+    EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
+            .WillRepeatedly(Return(&algo));
+    EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
+            .WillRepeatedly(Return(0));
+    EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
+            .WillRepeatedly(Return(0));
+
+    {
+        ::testing::InSequence seq;
+        // Return empty preprocess filters, indicating no need to preprocess
+        EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)).Times(0);
+        EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
+        EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
+        run();
+    }
+}
+
 } // anonymous namespace
 #endif
diff --git a/src/opr/test/tensor_manip.cpp b/src/opr/test/tensor_manip.cpp
index 11dd9727..3fd5dd1b 100644
--- a/src/opr/test/tensor_manip.cpp
+++ b/src/opr/test/tensor_manip.cpp
@@ -17,6 +17,7 @@
 #include "megbrain/opr/io.h"
 #include "megbrain/opr/blas.h"
 #include "megbrain/opr/utility.h"
+#include "megbrain/opr/misc.h"
 #include "megbrain/utils/arith_helper.h"
 
 using namespace mgb;
@@ -138,7 +139,7 @@ TEST(TestTensorManip, Reshape) {
     auto &&dep_map = opr0_reshp.node()->owner_opr()->node_prop().dep_map();
     using DT = cg::OperatorNodeBase::NodeProp::DepType;
     ASSERT_EQ(2u, dep_map.size());
-    ASSERT_EQ(DT::DEV_VALUE, dep_map.at(op->input(0)));
+    ASSERT_EQ(DT::DEV_VALUE | DT::VALUE_ALLOW_EMPTY, dep_map.at(op->input(0)));
     ASSERT_EQ(DT::HOST_VALUE, dep_map.at(op->input(1)));
 }
 
@@ -318,6 +319,39 @@ TEST(TestTensorManip, ReshapeInferShapeForDynamicInput) {
     run({23, 12, 5});
 }
 
+TEST(TestTensorManip, ReshapeEmptyShape) {
+    HostTensorGenerator<> gen;
+    constexpr size_t x_length = 233;
+    auto host_x = gen({x_length}),
+         host_v = gen({2, 3, 3, 3});
+    for (size_t i = 0; i < x_length; ++ i) {
+        host_x->ptr<float>()[i] = 1.f;
+    }
+    constexpr auto INVALID_AXIS = opr::Reshape::Param::INVALID_AXIS;
+    for (auto unspec_axis: {INVALID_AXIS, 0, 1, 3}) {
+        auto graph = ComputingGraph::make();
+        graph->options().graph_opt_level = 0;
+        TensorShape tshape{2, 3, 3, 3};
+        auto zero_axis = unspec_axis;
+        if (unspec_axis == INVALID_AXIS) {
+            tshape[zero_axis = 2] = 0;
+        }
+        using CondTakeMode = opr::CondTake::Param::Mode;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
+             x_empty = opr::CondTake::make(x, x, {CondTakeMode::EQ, 0.f})[0],
+             v = opr::Host2DeviceCopy::make(*graph, host_v),
+             x_reshape = opr::Reshape::make(x_empty, tshape, {unspec_axis}),
+             y = opr::Concat::make({x_reshape, v}, zero_axis);
+        HostTensorND host_empty, host_y;
+        auto func = graph->compile({
+                make_callback_copy(x_reshape, host_empty),
+                make_callback_copy(y, host_y)});
+        func->execute().wait();
+        ASSERT_TRUE(host_empty.layout().is_empty());
+        MGB_ASSERT_TENSOR_EQ(*host_v, host_y);
+    }
+}
+
 TEST(TestTensorManip, ReshapeWithNegativeUnspec) {
     HostTensorGenerator<> gen;
     auto host_x = gen({4, 8});
@@ -365,6 +399,26 @@ TEST(TestTensorManip, Broadcast) {
     }
 }
 
+TEST(TestTensorManip, BroadcastEmptyShape) {
+    HostTensorGenerator<> gen;
+    for (auto&& arg:
+            {std::make_pair(TensorShape{1}, TensorShape{0}),
+             {{1, 2, 3}, {0, 2, 3}},
+             {{2, 3}, {1, 0, 2, 3}},
+             {{1, 0, 2, 3}, {4, 0, 2, 3}},
+             {{0, 1, 2, 3}, {3, 0, 4, 2, 3}}}) {
+        auto host_x = gen(arg.first);
+        auto graph = ComputingGraph::make();
+        graph->options().graph_opt_level = 0;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
+             y = opr::Broadcast::make(x, arg.second);
+        HostTensorND host_y;
+        auto func = graph->compile({make_callback_copy(y, host_y)});
+        func->execute();
+        ASSERT_TRUE(host_y.shape().eq_shape(arg.second));
+    }
+}
+
 TEST(TestTensorManip, Dimshuffle) {
     HostTensorGenerator<> gen;
     constexpr size_t S0 = 8, S1 = 3;
@@ -395,6 +449,34 @@ TEST(TestTensorManip, Dimshuffle) {
     }
 }
 
+TEST(TestTensorManip, DimshuffleEmptyShape) {
+    HostTensorGenerator<> gen;
+    for (auto&& arg:
+            {std::make_pair(
+                    TensorShape{3, 0},
+                    std::vector<int>{1, -1, 0, -1}),
+             {{3, 1, 0, 4}, {-1, 3, -1, 0, 2}},
+             {{2, 0, 3, 0}, {1, 0, 2, 3}}}) {
+        auto host_x = gen(arg.first);
+        auto graph = ComputingGraph::make();
+        graph->options().graph_opt_level = 0;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
+             y = opr::Dimshuffle::make(x, arg.second);
+        HostTensorND host_y;
+        auto func = graph->compile({make_callback_copy(y, host_y)});
+        func->execute();
+        auto&& y_shape = host_y.shape();
+        for(size_t idx = 0; idx < arg.second.size(); ++ idx) {
+            auto elem = arg.second[idx];
+            if (elem == -1) {
+                ASSERT_EQ(y_shape[idx], 1u);
+            } else {
+                ASSERT_EQ(arg.first[elem], y_shape[idx]);
+            }
+        }
+    }
+}
+
 TEST(TestTensorManip, DimshuffleCombined) {
     using Checker = AutoOprChecker<1, 1>;
     constexpr int RED0 = 2, RED1 = 3;
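A worked reading of the DimshuffleEmptyShape expectations above (x hypothetical): with input shape {3, 0} and pattern {1, -1, 0, -1}, pattern entry i selects the input axis placed at output position i, and -1 inserts a new axis of extent 1, so the result shape is {0, 1, 3, 1}; zero extents simply ride along.

    auto y = opr::Dimshuffle::make(x, {1, -1, 0, -1});  // {3, 0} -> {0, 1, 3, 1}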
diff --git a/src/serialization/impl/schema.fbs b/src/serialization/impl/schema.fbs
index 623f5a70..2be86870 100644
--- a/src/serialization/impl/schema.fbs
+++ b/src/serialization/impl/schema.fbs
@@ -28,6 +28,7 @@ table Blob {
 }
 
 table Reserved0 {}
+table Reserved1 {}
 
 union OperatorParam {
     param.Empty = 1,
@@ -99,6 +100,8 @@ union OperatorParam {
     DType = 67,
     param.Remap = 68,
     param.NMSKeep = 69,
+    param.AdaptivePooling = 70,
+    Reserved1 = 71,
 }
 
 table Operator {
diff --git a/imperative/src/version.ld b/src/version.ld
similarity index 100%
rename from imperative/src/version.ld
rename to src/version.ld
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ab94fe1b..eecc1941 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -43,3 +43,9 @@ endif()
 if (MGE_WITH_DISTRIBUTED)
     target_link_libraries(megbrain_test megray)
 endif()
+
+if(MGE_WITH_JIT)
+    if(MGE_WITH_JIT_MLIR)
+        add_subdirectory(${PROJECT_SOURCE_DIR}/src/jit/test/mlir ${CMAKE_CURRENT_BINARY_DIR}/../src/jit/test/mlir)
+    endif()
+endif()
diff --git a/test/src/helper.cpp b/test/src/helper.cpp
index 451bd473..113a5d03 100644
--- a/test/src/helper.cpp
+++ b/test/src/helper.cpp
@@ -113,6 +113,20 @@ dtype, RandomDistribution::CONSTANT>::operator ()(
     return ret;
 }
 
+template<typename dtype>
+std::shared_ptr<HostTensorND> HostTensorGenerator<
+dtype, RandomDistribution::CONSECUTIVE>::operator ()(
+        const TensorShape &shape, CompNode cn) {
+    if (!cn.valid())
+        cn = CompNode::load("xpu0");
+    std::shared_ptr<HostTensorND> ret =
+            std::make_shared<HostTensorND>(cn, shape, dtype());
+    auto ptr = ret->ptr<ctype>();
+    for (size_t i = 0, it = shape.total_nr_elems(); i < it; ++ i) {
+        ptr[i] = m_val + i * m_delta;
+    }
+    return ret;
+}
 
 // explicit instantialization of HostTensorGenerator
 namespace mgb {
@@ -123,12 +137,16 @@ namespace mgb {
     template class HostTensorGenerator<
         dtype::Float32, RandomDistribution::CONSTANT>;
     template class HostTensorGenerator<
+        dtype::Float32, RandomDistribution::CONSECUTIVE>;
+    template class HostTensorGenerator<
         dtype::Float16, RandomDistribution::GAUSSIAN>;
     template class HostTensorGenerator<
         dtype::Int8, RandomDistribution::UNIFORM>;
     template class HostTensorGenerator<
         dtype::Int8, RandomDistribution::CONSTANT>;
     template class HostTensorGenerator<
+        dtype::Int8, RandomDistribution::CONSECUTIVE>;
+    template class HostTensorGenerator<
         dtype::Uint8, RandomDistribution::UNIFORM>;
     template class HostTensorGenerator<
         dtype::Uint8, RandomDistribution::CONSTANT>;
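Since the CONSECUTIVE operator() above is defined out of line in helper.cpp, only the dtypes explicitly instantiated there (Float32 and Int8) link against it; any other dtype would need a further explicit instantiation along these lines (hypothetical):

    template class HostTensorGenerator<dtype::Uint8, RandomDistribution::CONSECUTIVE>;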
diff --git a/test/src/include/megbrain/test/helper.h b/test/src/include/megbrain/test/helper.h
index c6a0c951..72630a2d 100644
--- a/test/src/include/megbrain/test/helper.h
+++ b/test/src/include/megbrain/test/helper.h
@@ -168,7 +168,7 @@ class RNGxorshf {
 };
 
 enum class RandomDistribution {
-    GAUSSIAN, UNIFORM, CONSTANT
+    GAUSSIAN, UNIFORM, CONSTANT, CONSECUTIVE
 };
 
 template
@@ -342,6 +342,29 @@ class HostTensorGenerator<dtype, RandomDistribution::CONSTANT> final:
     private:
         ctype m_default_val;
 };
+
+//! consecutive value
+template<class dtype>
+class HostTensorGenerator<dtype, RandomDistribution::CONSECUTIVE> final:
+        public HostTensorGeneratorBase {
+
+    public:
+        using ctype = typename DTypeTrait<dtype>::ctype;
+
+        HostTensorGenerator(ctype val, ctype delta)
+                : HostTensorGeneratorBase{next_rand_seed()},
+                  m_val{val}, m_delta{delta} {}
+
+        std::shared_ptr<HostTensorND> operator ()(
+                const TensorShape &shape, CompNode cn = {}) override;
+        using HostTensorGeneratorBase::operator();
+
+    private:
+        ctype m_val;
+        ctype m_delta;
+};
+
+
 template <>
 class HostTensorGenerator final
         : public HostTensorGeneratorBase {
diff --git a/tools/param_defs/mgb_opr_param_defs.py b/tools/param_defs/mgb_opr_param_defs.py
index 6b64364a..bbef32d6 100644
--- a/tools/param_defs/mgb_opr_param_defs.py
+++ b/tools/param_defs/mgb_opr_param_defs.py
@@ -143,3 +143,4 @@ pdef('PersistentOutputStorage').add_fields(
          ' no branch is taken')
      )
 )
+