@@ -53,9 +53,11 @@ option(MGE_WITH_DISTRIBUTED "Build with distributed support" ON) | |||
option(MGE_BUILD_IMPERATIVE_RT "Build _imperative_rt Python Module " ON) | |||
option(MGE_BUILD_SDK "Build load_and_run" ON) | |||
option(MGE_INFERENCE_ONLY "Build inference only library." OFF) | |||
option(MGE_WITH_PYTHON_MODULE "Build MegEngine legacy Python Module." OFF) | |||
option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support" ON)
option(MGE_WITH_ROCM "Enable ROCM support" OFF) | |||
if(NOT ${MGE_BIN_REDUCE} STREQUAL "") | |||
message("build with BIN REDUCE") | |||
if(MGE_WITH_MINIMUM_SIZE) | |||
@@ -152,6 +154,14 @@ if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} S | |||
endif() | |||
if(MSVC OR WIN32) | |||
# for cmake after 3.15.2 | |||
cmake_policy(SET CMP0091 NEW) | |||
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") | |||
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebug") | |||
else() | |||
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded") | |||
endif() | |||
add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) | |||
message("-- into windows build...") | |||
message("-- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") | |||
@@ -285,7 +295,6 @@ if(MGE_WITH_TEST) | |||
endif() | |||
if(MGE_BUILD_IMPERATIVE_RT) | |||
add_compile_definitions(MGB_ENABLE_IMPERATIVE_RUNTIME) | |||
set(CMAKE_CXX_STANDARD 17) | |||
endif() | |||
@@ -701,7 +710,8 @@ endif() | |||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MARCH}") | |||
set(MGB_ENABLE_IMPERATIVE ${MGE_BUILD_IMPERATIVE_RT}) | |||
set(MGE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/src/version.ld CACHE INTERNAL "Path to linker version script") | |||
# Write out megbrain_build_config.h | |||
# It defines macros needed by both megbrain and dnn | |||
configure_file(src/megbrain_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h) | |||
@@ -831,3 +841,8 @@ if(MSVC OR WIN32) | |||
endif() | |||
endforeach() | |||
endif() | |||
if(MGE_WITH_JIT_MLIR) | |||
add_subdirectory(tools/mlir/mgb-opt) | |||
add_subdirectory(tools/mlir/mgb-file-check) | |||
endif() |
@@ -683,6 +683,53 @@ protected: | |||
}; | |||
/** | |||
* \brief base class for AdaptivePooling | |||
*/ | |||
class AdaptivePoolingBase : public OperatorBase { | |||
DEF_OPR_IMPL_CTOR(AdaptivePoolingBase, OperatorBase); | |||
DEF_OPR_PARAM(AdaptivePooling); | |||
protected: | |||
param::Pooling deduce_pooling_param(const TensorLayout& src, | |||
const TensorLayout& dst); | |||
}; | |||
class AdaptivePoolingForward : public AdaptivePoolingBase { | |||
DEF_OPR_IMPL(AdaptivePoolingForward, AdaptivePoolingBase, 1, 1); | |||
public: | |||
/** | |||
* \param[in] src input tensor | |||
* \param[out] dst output tensor | |||
*/ | |||
virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) = 0; | |||
virtual size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) = 0; | |||
}; | |||
using AdaptivePooling = AdaptivePoolingForward; | |||
class AdaptivePoolingBackward : public AdaptivePoolingBase { | |||
DEF_OPR_IMPL(AdaptivePoolingBackward, AdaptivePoolingBase, 3, 1); | |||
public: | |||
/** | |||
* \param[in] src the `src' parameter in AdaptivePoolingForward::exec | |||
* \param[in] dst the `dst' parameter in AdaptivePoolingForward::exec | |||
* \param[in] diff the backpropagated gradient wrt. dst | |||
* \param[out] grad the backpropagated gradient wrt. src | |||
*/ | |||
virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, | |||
_megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) = 0; | |||
virtual size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) = 0; | |||
}; | |||
/** | |||
* \brief base class for Local | |||
*/ | |||
class LocalBase : public OperatorBase { | |||
@@ -179,6 +179,11 @@ pdef('Axis').add_fields('int32', 'axis', 0) | |||
add_enum_alias('Format', 'ConvolutionV0') | |||
) | |||
(pdef('AdaptivePooling'). | |||
add_enum_alias('Mode', 'Pooling'). | |||
add_enum_alias('Format', 'ConvolutionV0') | |||
) | |||
(pdef('LRN', | |||
'see ImageNet Classification with Deep Convolutional Neural Networks for' | |||
' meaning of the fields'). | |||
@@ -55,8 +55,12 @@ void AtlasComputingContext::memcpy(void* dst, const void* src, | |||
default: | |||
megdnn_throw("bad atlas memcpy kind"); | |||
} | |||
#if MGB_USE_ATLAS_ASYNC_API | |||
acl_check(aclrtMemcpyAsync(dst, size_in_bytes, src, size_in_bytes, | |||
atlas_kind, m_ctx.stream)); | |||
#else | |||
acl_check(aclrtMemcpy(dst, size_in_bytes, src, size_in_bytes, atlas_kind)); | |||
#endif | |||
} | |||
void AtlasComputingContext::memset(void* dst, int value, size_t size_in_bytes) { | |||
@@ -65,7 +69,11 @@ void AtlasComputingContext::memset(void* dst, int value, size_t size_in_bytes) { | |||
} | |||
void AtlasComputingContext::synchronize() { | |||
#if MGB_USE_ATLAS_ASYNC_API | |||
acl_check(aclrtSynchronizeStream(m_ctx.stream)); | |||
#else | |||
return; | |||
#endif | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,37 @@ | |||
/** | |||
* \file dnn/src/common/adaptive_pooling.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "megdnn/opr_param_defs.h" | |||
#include "megdnn/oprs.h" | |||
#include "src/common/utils.h" | |||
namespace megdnn { | |||
param::Pooling AdaptivePoolingBase::deduce_pooling_param( | |||
const TensorLayout& src, const TensorLayout& dst) { | |||
megdnn_assert(param().format == param::AdaptivePooling::Format::NCHW); | |||
size_t IH = src.shape[2], IW = src.shape[3], OH = dst.shape[2], | |||
OW = dst.shape[3]; | |||
param::Pooling ret; | |||
ret.mode = param().mode; | |||
ret.format = param().format; | |||
ret.pad_h = ret.pad_w = 0; | |||
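// choose stride/window so the OH x OW output grid exactly covers the IH x IW
// input: stride = floor(I / O), window = I - (O - 1) * stride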
ret.stride_h = floor(IH / OH); | |||
ret.stride_w = floor(IW / OW); | |||
ret.window_h = IH - (OH - 1) * ret.stride_h; | |||
ret.window_w = IW - (OW - 1) * ret.stride_w; | |||
return ret; | |||
} | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
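For reference, the same stride/window arithmetic as a minimal standalone Python sketch (assumes NCHW layout and 2D spatial dims; the helper name is hypothetical, not a MegEngine API):

def adaptive_to_pooling(ih, iw, oh, ow):
    # stride = floor(input / output); the window covers what the strides leave over
    stride_h, stride_w = ih // oh, iw // ow
    window_h = ih - (oh - 1) * stride_h
    window_w = iw - (ow - 1) * stride_w
    return dict(pad=(0, 0), stride=(stride_h, stride_w), window=(window_h, window_w))

# e.g. a 7x7 feature map pooled adaptively to 3x3 uses stride 2 and window 3:
# adaptive_to_pooling(7, 7, 3, 3) -> {'pad': (0, 0), 'stride': (2, 2), 'window': (3, 3)}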
@@ -392,8 +392,6 @@ TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const { | |||
TensorLayout result{dtype, format}; | |||
result.ndim = tshape.ndim; | |||
for (size_t i = 0; i < tshape.ndim; i++) { | |||
megdnn_throw_if(!tshape.shape[i], tensor_reshape_error, | |||
megdnn_mangle("target shape is 0")); | |||
result.shape[i] = tshape.shape[i]; | |||
result.stride[i] = (tshape.shape[i] == 1); | |||
} | |||
@@ -409,8 +407,6 @@ TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const { | |||
for (size_t i = 0; i < tshape.ndim; ++i) { | |||
int target_idx = tshape.ndim - i - 1; | |||
int cur_idx = ndim - i - 1; | |||
megdnn_throw_if(!tshape.shape[target_idx], tensor_reshape_error, | |||
megdnn_mangle("target shape is 0")); | |||
size_t cur_shape = (cur_idx >= 0 ? shape[cur_idx] : 1), | |||
cur_stride = (cur_idx >= 0 ? stride[cur_idx] : 0); | |||
if (tshape.shape[target_idx] != cur_shape) { | |||
@@ -434,10 +430,16 @@ TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const { | |||
bool TensorLayout::try_reshape(TensorLayout& result, | |||
const TensorShape& tshp) const { | |||
megdnn_assert(tshp.ndim); | |||
bool is_empty_shape = false; | |||
for (size_t i = 0; i < tshp.ndim; ++i) { | |||
megdnn_throw_if(!tshp.shape[i], tensor_reshape_error, | |||
megdnn_mangle(ssprintf("bad target tshp: %s", | |||
tshp.to_string().c_str()))); | |||
if (!tshp.shape[i]) { | |||
megdnn_throw_if(!format.is_default(), tensor_reshape_error, | |||
megdnn_mangle(ssprintf("bad target tshp: %s", | |||
tshp.to_string().c_str()))); | |||
is_empty_shape = true; | |||
break; | |||
} | |||
} | |||
megdnn_throw_if( | |||
@@ -454,6 +456,11 @@ bool TensorLayout::try_reshape(TensorLayout& result, | |||
result.format = this->format; | |||
result.TensorShape::operator=(tshp); | |||
if (is_empty_shape) { | |||
result.init_contiguous_stride(); | |||
return true; | |||
} | |||
size_t sdim = 0, prod = 1, cont_sdim = 0; | |||
for (size_t i = 0; i < tshp.ndim; ++i) { | |||
megdnn_assert(cont_sdim < cont.ndim); | |||
@@ -199,6 +199,8 @@ private: | |||
cb(Remap) \ | |||
cb(RemapBackwardData) \ | |||
cb(RemapBackwardMat) \ | |||
cb(AdaptivePoolingForward) \ | |||
cb(AdaptivePoolingBackward) \ | |||
/*! | |||
* \brief specialize HandleImpl::create_operator for a single opr type; | |||
@@ -0,0 +1,53 @@ | |||
/** | |||
* \file dnn/src/cuda/adaptive_pooling/opr_impl.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "src/cuda/adaptive_pooling/opr_impl.h" | |||
#include "src/cuda/utils.h" | |||
namespace megdnn { | |||
namespace cuda { | |||
void AdaptivePoolingForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
auto opr = handle()->create_operator<PoolingForward>(); | |||
opr->param() = deduce_pooling_param(src.layout, dst.layout); | |||
opr->exec(src, dst, workspace); | |||
} | |||
size_t AdaptivePoolingForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& dst) { | |||
auto opr = handle()->create_operator<PoolingForward>(); | |||
opr->param() = deduce_pooling_param(src, dst); | |||
return opr->get_workspace_in_bytes(src, dst); | |||
} | |||
void AdaptivePoolingBackwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in dst, | |||
_megdnn_tensor_in diff, | |||
_megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) { | |||
auto opr = handle()->create_operator<PoolingBackward>(); | |||
opr->param() = deduce_pooling_param(src.layout, dst.layout); | |||
opr->exec(src, dst, diff, grad, workspace); | |||
} | |||
size_t AdaptivePoolingBackwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& dst, | |||
const TensorLayout& diff, const TensorLayout& grad) { | |||
auto opr = handle()->create_operator<PoolingBackward>(); | |||
opr->param() = deduce_pooling_param(src, dst); | |||
return opr->get_workspace_in_bytes(src, dst, diff, grad); | |||
} | |||
} // namespace cuda | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,44 @@ | |||
/** | |||
* \file dnn/src/cuda/adaptive_pooling/opr_impl.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megdnn/oprs.h" | |||
#include "src/cuda/cudnn_wrapper.h" | |||
#include "src/cuda/utils.h" | |||
namespace megdnn { | |||
namespace cuda { | |||
class AdaptivePoolingForwardImpl final : public AdaptivePoolingForward { | |||
public: | |||
using AdaptivePoolingForward::AdaptivePoolingForward; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) override; | |||
}; | |||
class AdaptivePoolingBackwardImpl final : public AdaptivePoolingBackward { | |||
public: | |||
using AdaptivePoolingBackward::AdaptivePoolingBackward; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, | |||
_megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) override; | |||
}; | |||
} // namespace cuda | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -11,6 +11,7 @@ | |||
#include "src/common/handle_impl.h" | |||
#include "src/cuda/adaptive_pooling/opr_impl.h" | |||
#include "src/cuda/add_update/opr_impl.h" | |||
#include "src/cuda/argmxx/opr_impl.h" | |||
#include "src/cuda/argsort/opr_impl.h" | |||
@@ -72,6 +72,7 @@ namespace indexing_multi_axis_vec { | |||
#define cb0(_dtype) \ | |||
MEGDNN_FOREACH_TENSOR_NDIM(INST, DTypeTrait<_dtype>::ctype) | |||
MEGDNN_FOREACH_COMPUTING_DTYPE(cb0) | |||
cb0(::megdnn::dtype::Bool) | |||
#undef cb0 | |||
#undef INST | |||
@@ -39,6 +39,11 @@ __device__ void atomicAdd(megdnn::dt_int16 *, megdnn::dt_int16) { | |||
((int*)0)[0] = 1; | |||
} | |||
__device__ void atomicAdd(megdnn::dt_bool *, megdnn::dt_bool) { | |||
__trap(); | |||
((int*)0)[0] = 1; | |||
} | |||
#define KERN_APPLY_OPR_OPR \ | |||
::megdnn::cuda::indexing_multi_axis_vec::OprAtomicIncr | |||
#include "./kern_apply_opr_impl.cuinl" | |||
@@ -120,6 +120,7 @@ void ExecImpl<Opr>::dispatch_exec() { | |||
case DTypeTrait<_dtype>::enumv: \ | |||
return dispatch_exec_ctype<DTypeTrait<_dtype>::ctype>(); | |||
MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | |||
cb(::megdnn::dtype::Bool) | |||
#undef cb | |||
default: | |||
megdnn_throw("bad dtype"); | |||
@@ -0,0 +1,52 @@ | |||
/** | |||
* \file dnn/src/naive/adaptive_pooling/opr_impl.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "src/naive/adaptive_pooling/opr_impl.h" | |||
#include "src/common/opr_delegate.h" | |||
#include "src/common/utils.h" | |||
#include "src/naive/handle.h" | |||
namespace megdnn { | |||
namespace naive { | |||
void AdaptivePoolingForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
MEGDNN_DISPATCH_CPU_KERN(static_cast<naive::HandleImpl*>(handle()), { | |||
auto opr = inplace_cpu_handle()->create_operator<PoolingForward>(); | |||
opr->param() = deduce_pooling_param(src.layout, dst.layout); | |||
opr->exec(src, dst, workspace); | |||
}); | |||
} | |||
void AdaptivePoolingBackwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in dst, | |||
_megdnn_tensor_in diff, | |||
_megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) { | |||
MEGDNN_DISPATCH_CPU_KERN(static_cast<naive::HandleImpl*>(handle()), { | |||
auto opr = inplace_cpu_handle()->create_operator<PoolingBackward>(); | |||
opr->param() = deduce_pooling_param(src.layout, dst.layout); | |||
opr->exec(src, dst, diff, grad, workspace); | |||
}); | |||
} | |||
size_t AdaptivePoolingBackwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& dst, | |||
const TensorLayout& diff, const TensorLayout& grad) { | |||
auto opr = inplace_cpu_handle()->create_operator<PoolingBackward>(); | |||
opr->param() = deduce_pooling_param(src, dst); | |||
return opr->get_workspace_in_bytes(src, dst, diff, grad); | |||
} | |||
} // namespace naive | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,43 @@ | |||
/** | |||
* \file dnn/src/naive/adaptive_pooling/opr_impl.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megdnn/oprs.h" | |||
#include "src/common/utils.h" | |||
namespace megdnn { | |||
namespace naive { | |||
class AdaptivePoolingForwardImpl : public AdaptivePoolingForward { | |||
public: | |||
using AdaptivePoolingForward::AdaptivePoolingForward; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
} | |||
}; | |||
class AdaptivePoolingBackwardImpl : public AdaptivePoolingBackward { | |||
public: | |||
using AdaptivePoolingBackward::AdaptivePoolingBackward; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, | |||
_megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) override; | |||
}; | |||
} // namespace naive | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -13,6 +13,7 @@ | |||
#include "src/common/handle_impl.h" | |||
#include "src/naive/adaptive_pooling/opr_impl.h" | |||
#include "src/naive/add_update/opr_impl.h" | |||
#include "src/naive/argmxx/opr_impl.h" | |||
#include "src/naive/argsort/opr_impl.h" | |||
@@ -88,6 +88,7 @@ void dispatch_exec(HandleImpl *handle, | |||
} | |||
switch (data.layout.dtype.enumv()) { | |||
MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | |||
cb(::megdnn::dtype::Bool) | |||
default: | |||
megdnn_throw(megdnn_mangle("bad dtype")); | |||
} | |||
@@ -0,0 +1,55 @@ | |||
/** | |||
* \file dnn/test/common/adaptive_pooling.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include <cstddef> | |||
#include "megdnn/basic_types.h" | |||
#include "megdnn/opr_param_defs.h" | |||
namespace megdnn { | |||
namespace test { | |||
namespace adaptive_pooling { | |||
struct TestArg { | |||
param::AdaptivePooling param; | |||
TensorShape ishape; | |||
TensorShape oshape; | |||
TestArg(param::AdaptivePooling param, TensorShape ishape, | |||
TensorShape oshape) | |||
: param(param), ishape(ishape), oshape(oshape) {} | |||
}; | |||
inline std::vector<TestArg> get_args() { | |||
std::vector<TestArg> args; | |||
using Param = param::AdaptivePooling; | |||
using Mode = param::AdaptivePooling::Mode; | |||
for (size_t i = 36; i < 40; ++i) { | |||
args.emplace_back(Param{Mode::AVERAGE}, TensorShape{2, 3, i, i + 1}, | |||
TensorShape{2, 3, i - 4, i - 2}); | |||
args.emplace_back(Param{Mode::MAX}, TensorShape{2, 3, i, i + 1}, | |||
TensorShape{2, 3, i - 4, i - 2}); | |||
} | |||
for (size_t i = 5; i < 10; ++i) { | |||
args.emplace_back(Param{Mode::AVERAGE}, TensorShape{2, 3, i, i + 1}, | |||
TensorShape{2, 3, i - 3, i - 2}); | |||
args.emplace_back(Param{Mode::MAX}, TensorShape{2, 3, i, i + 1}, | |||
TensorShape{2, 3, i - 3, i - 2}); | |||
} | |||
return args; | |||
} | |||
} // namespace adaptive_pooling | |||
} // namespace test | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -41,6 +41,8 @@ DEF(Images2NeibsForward, 2, true, true); | |||
DEF(Images2NeibsBackward, 2, true, false); | |||
DEF(PoolingForward, 2, true, true); | |||
DEF(PoolingBackward, 4, true, false); | |||
DEF(AdaptivePoolingForward, 2, true, false); | |||
DEF(AdaptivePoolingBackward, 4, true, false); | |||
DEF(LocalForward, 3, true, true); | |||
DEF(LocalBackwardData, 3, true, false); | |||
DEF(LocalBackwardFilter, 3, true, false); | |||
@@ -0,0 +1,97 @@ | |||
/** | |||
* \file dnn/test/cuda/adaptive_pooling.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "test/cuda/fixture.h" | |||
#include "megdnn/tensor_iter.h" | |||
#include "test/common/adaptive_pooling.h" | |||
#include "test/common/checker.h" | |||
#include "src/common/utils.h" | |||
#include "test/cuda/utils.h" | |||
#include <cudnn.h> | |||
#include "test/cuda/benchmark.h" | |||
namespace megdnn { | |||
namespace test { | |||
TEST_F(CUDA, ADAPTIVE_POOLING_FORWARD) { | |||
auto args = adaptive_pooling::get_args(); | |||
using Format = param::AdaptivePooling::Format; | |||
DType dtype = dtype::Float32(); | |||
for (auto&& arg : args) { | |||
auto param = arg.param; | |||
auto src = arg.ishape; | |||
auto dst = arg.oshape; | |||
param.format = Format::NCHW; | |||
Checker<AdaptivePooling> checker(handle_cuda()); | |||
checker.set_epsilon(1e-2); | |||
checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec( | |||
TensorShapeArray{src, dst, {}}); | |||
} | |||
} | |||
TEST_F(CUDA, ADAPTIVE_POOLING_BACKWARD) { | |||
auto args = adaptive_pooling::get_args(); | |||
for (auto&& arg : args) { | |||
Checker<AdaptivePoolingBackward> checker(handle_cuda()); | |||
TensorLayout ilayout = TensorLayout(arg.ishape, dtype::Float32()); | |||
TensorLayout olayout = TensorLayout(arg.oshape, dtype::Float32()); | |||
auto constraint = [this, | |||
arg](CheckerHelper::TensorValueArray& tensors_orig) { | |||
megdnn_assert(tensors_orig.size() == 4); | |||
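// run the forward opr on the device first so that tensors[1] (dst) holds a
// valid pooling result before the backward opr is checked against it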
auto opr = handle_cuda()->create_operator<AdaptivePoolingForward>(); | |||
opr->param() = arg.param; | |||
auto tensors_cuda_storage = CheckerHelper::alloc_tensors( | |||
handle_cuda(), | |||
{tensors_orig[0].layout, tensors_orig[1].layout}, 0); | |||
auto&& tensors_cuda = *tensors_cuda_storage; | |||
auto span = tensors_cuda[0].layout.span(); | |||
auto dst = static_cast<dt_byte*>(tensors_cuda[0].raw_ptr) + | |||
span.low_byte; | |||
auto src = static_cast<const dt_byte*>(tensors_orig[0].raw_ptr) + | |||
span.low_byte; | |||
megdnn_memcpy_H2D(handle_cuda(), dst, src, span.dist_byte()); | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors_cuda[0].layout, tensors_cuda[1].layout); | |||
auto workspace_cuda = megdnn_malloc(handle_cuda(), workspace_size); | |||
Workspace workspace{static_cast<dt_byte*>(workspace_cuda), | |||
workspace_size}; | |||
opr->exec(tensors_cuda[0], tensors_cuda[1], workspace); | |||
megdnn_free(handle_cuda(), workspace_cuda); | |||
span = tensors_cuda[1].layout.span(); | |||
dst = static_cast<dt_byte*>(tensors_orig[1].raw_ptr) + | |||
span.low_byte; | |||
src = static_cast<const dt_byte*>(tensors_cuda[1].raw_ptr) + | |||
span.low_byte; | |||
megdnn_memcpy_D2H(handle_cuda(), dst, src, span.dist_byte()); | |||
}; | |||
DType dtype = dtype::Float32(); | |||
checker.set_tensors_constraint(constraint) | |||
.set_dtype(0, dtype) | |||
.set_dtype(1, dtype) | |||
.set_dtype(2, dtype) | |||
.set_dtype(3, dtype) | |||
.set_param(arg.param) | |||
.exec(TensorShapeArray{ilayout, olayout, olayout, ilayout}); | |||
} | |||
} | |||
} // namespace test | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -6,7 +6,8 @@ | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "megdnn/oprs/nn.h" | |||
@@ -37,7 +38,7 @@ std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) { | |||
args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1}); | |||
args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2}); | |||
args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2}); | |||
args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1}); | |||
args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1}); | |||
args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1}); | |||
@@ -614,11 +615,8 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) { | |||
param.stride_h = param.stride_w = 1; | |||
param.format = param::ConvBias::Format::CHWN4; | |||
param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH; | |||
checker.set_param(param).execs({{4, 12, 12, 32, 4}, | |||
{4, 3, 3, 16, 4}, | |||
{4, 1, 1, 1, 4}, | |||
{}, | |||
{}}); | |||
checker.set_param(param).execs( | |||
{{4, 12, 12, 32, 4}, {4, 3, 3, 16, 4}, {4, 1, 1, 1, 4}, {}, {}}); | |||
} | |||
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) { | |||
@@ -1076,7 +1074,6 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) { | |||
} | |||
#if CUDA_VERSION >= 10020 | |||
/// \note: we only check several cases and block sizes in megdnn_test, the full | |||
/// testcases are written in cutlass repository | |||
@@ -1234,8 +1231,7 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) { | |||
handle_cuda(), get_resnet50_bench_args(64), | |||
dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f}, | |||
dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, | |||
"INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", | |||
param::ConvBias::Format::NCHW4); | |||
"INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4); | |||
} | |||
#endif | |||
} // namespace test | |||
@@ -47,8 +47,7 @@ add_custom_target(gen_opr_py DEPENDS ${GEN_OPS_FILE}) | |||
##################### end of opdef generation ######################### | |||
set(VERSION_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/src/version.ld) | |||
add_custom_target(_version_ld SOURCES ${VERSION_SCRIPT}) | |||
add_custom_target(_version_ld SOURCES ${MGE_VERSION_SCRIPT}) | |||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11) | |||
pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS}) | |||
@@ -57,8 +56,21 @@ if (APPLE) | |||
elseif (MSVC OR WIN32) | |||
# Windows does not support implicitly importing data members from DLL. | |||
target_link_libraries(${MODULE_NAME} PRIVATE megbrain megdnn) | |||
message("-- CMAKE_MSVC_RUNTIME_LIBRARY: ${CMAKE_MSVC_RUNTIME_LIBRARY}") | |||
set_target_properties(${MODULE_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY "${CMAKE_MSVC_RUNTIME_LIBRARY}") | |||
else() | |||
target_link_libraries(${MODULE_NAME} PRIVATE megengine_export -Wl,--version-script=${VERSION_SCRIPT}) | |||
if (MGE_WITH_PYTHON_MODULE) | |||
# used to fix a runtime crash when building both mgb (MGE_WITH_PYTHON_MODULE) and imperative (MGE_BUILD_IMPERATIVE_RT)
target_link_libraries(${MODULE_NAME} PRIVATE megengine_export -Wl,--version-script=${MGE_VERSION_SCRIPT}) | |||
else() | |||
# used to reduce the wheel size by depending on megbrain/dnn directly; otherwise cmake creates two
# cuda fatbin elf sections, one in megengine_export and one in every target that depends on it
target_link_libraries(${MODULE_NAME} PRIVATE megbrain megdnn -Wl,--version-script=${MGE_VERSION_SCRIPT}) | |||
if (MGE_WITH_DISTRIBUTED) | |||
message("-- Imperative configured to link megray") | |||
target_link_libraries(${MODULE_NAME} PRIVATE megray) | |||
endif() | |||
endif() | |||
endif() | |||
target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR}) | |||
@@ -76,7 +76,7 @@ from .logger import enable_debug_log, get_logger, set_log_file, set_log_level | |||
from .serialization import load, save | |||
from .tensor import Parameter, Tensor, tensor | |||
from .version import __version__ | |||
from .core import cgtools | |||
from .utils import comp_graph_tools as cgtools | |||
_set_fork_exec_path_for_timed_func( | |||
sys.executable, | |||
@@ -20,7 +20,7 @@ class GradManager: | |||
the forward operations start and when all resources should be released. A typical usage of | |||
GradManager is as follows: | |||
.. codeblock:: | |||
.. code-block:: | |||
gm = GradManager() | |||
gm.attach(model.parameters()) | |||
@@ -32,7 +32,7 @@ class GradManager: | |||
You can also use `record()` and `release()` method instead of `with` context: | |||
.. codeblock:: | |||
.. code-block:: | |||
gm = GradManager() | |||
gm.attach(model.parameters()) | |||
@@ -50,7 +50,7 @@ class GradManager: | |||
processes. Users will finally get the averaged gradients if an "AllReduce" | |||
callback is registered as follows: | |||
.. codeblock:: | |||
.. code-block:: | |||
import megengine.distributed as dist | |||
@@ -71,7 +71,7 @@ class GradManager: | |||
r"""Registers parameters that gradients should be calculated with respect to. | |||
Callback Functions should have a signature like this: | |||
.. codeblock:: | |||
.. code-block:: | |||
def cb(param: Tensor, grad: Tensor) -> Tensor: | |||
# do something | |||
@@ -100,6 +100,8 @@ class GradManager: | |||
:param ys: outputs of forward operators, e.g., the loss tensor | |||
:param dys: derivatives of ys | |||
""" | |||
from ..functional import ones_like | |||
global backwarding_grad_manager | |||
cache = backwarding_grad_manager | |||
backwarding_grad_manager = self | |||
@@ -113,7 +115,7 @@ class GradManager: | |||
if not isinstance(ys, (tuple, list)): | |||
ys = [ys] | |||
if dys is None: | |||
dys = [tensor(1.0).broadcast(y.shape) for y in ys] | |||
dys = [ones_like(y) for y in ys] | |||
if not isinstance(dys, (tuple, list)): | |||
dys = [dys] | |||
try: | |||
@@ -11,4 +11,3 @@ import sys | |||
from .tensor import Tensor | |||
from .tensor.megbrain_graph import Graph | |||
from .utils import comp_graph_tools as cgtools |
@@ -22,11 +22,13 @@ class Device: | |||
else: | |||
self._cn = CompNode(device) | |||
self.logical_name = self._cn.logical_name | |||
def to_c(self): | |||
return self._cn | |||
def __repr__(self): | |||
return "{}({})".format(type(self).__qualname__, self) | |||
return "{}({})".format(type(self).__qualname__, repr(self._cn)) | |||
def __str__(self): | |||
return str(self._cn) | |||
@@ -160,7 +160,7 @@ def subtensor_grad_fn(op, inputs, outputs, input_requires_grad): | |||
def make_grad(grad_op, dy): | |||
grad = ( | |||
TensorWrapper(0, dtype=dy.dtype, device=dy.device) | |||
.broadcast(TensorWrapper(input_shape)) | |||
._broadcast(TensorWrapper(input_shape)) | |||
.__wrapped__ | |||
) | |||
(dx,) = apply(grad_op, grad, dy, *params) | |||
@@ -186,7 +186,7 @@ def indexingMultiAxisVec_grad_fn(op, inputs, outputs, input_requires_grad): | |||
def make_grad(grad_op, dy): | |||
grad = ( | |||
TensorWrapper(0, dtype=dy.dtype, device=dy.device) | |||
.broadcast(TensorWrapper(input_shape)) | |||
._broadcast(TensorWrapper(input_shape)) | |||
.__wrapped__ | |||
) | |||
(dx,) = apply(grad_op, grad, dy, *params) | |||
@@ -50,8 +50,8 @@ class Function: | |||
""" | |||
Applies operations to ``inputs`` and returns results. It must be overridden by all subclasses.
:param input: Input tensors. | |||
:return: A tuple of Tensor or a single Tensor. | |||
:param input: input tensors. | |||
:return: a tuple of Tensor or a single Tensor. | |||
.. note:: | |||
@@ -64,12 +64,12 @@ class Function: | |||
""" | |||
Compute the gradient of the forward function. It must be overridden by all subclasses.
:param output_grads: gradients of outputs that are returned by :meth:`~.function.Function.forward` | |||
:param output_grads: gradients of outputs that are returned by :meth:`~.function.Function.forward`. | |||
.. note:: | |||
.. note:: | |||
In case when some tensors of outputs are not related to loss function, the corresponding | |||
values in ``output_grads`` would be ``None``. | |||
When some output tensors are not related to the loss function, the corresponding
values in ``output_grads`` will be ``None``.
.. note:: | |||
@@ -173,7 +173,7 @@ def unpack_getitem(inp, tuple_val, *, allow_newaxis=True): | |||
item.append(True) | |||
v = get_index(v) | |||
assert np.issubdtype(v.dtype, np.integer) or np.issubdtype( | |||
v.dtype, np.bool | |||
v.dtype, np.bool_ | |||
), "var type in the subscript must be int or bool" | |||
tensors.append(v) | |||
@@ -267,7 +267,7 @@ def setitem(tensor, index, value): | |||
value.shape, tmp_result.shape | |||
) | |||
) | |||
value = value.broadcast(tmp_result.shape) | |||
value = value._broadcast(tmp_result.shape) | |||
if use_subtensor: | |||
op = builtin.SetSubtensor(items=items) | |||
else: | |||
@@ -8,6 +8,7 @@ | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
import collections | |||
import json | |||
import os | |||
import threading | |||
import weakref | |||
from concurrent.futures import Future, ThreadPoolExecutor | |||
@@ -49,7 +50,16 @@ class Graph(_imperative_rt.ComputingGraph): | |||
def execute(self, *args): | |||
assert self._future is None | |||
self._future = self._executor.submit(self._function.execute, *args) | |||
def wrapped(*args): | |||
try: | |||
self._function.execute(*args) | |||
except Exception as exc: | |||
for i in self._function._all_rendezvous: | |||
i.set_exception(str(exc)) | |||
raise exc | |||
self._future = self._executor.submit(wrapped, *args) | |||
def wait(self): | |||
assert self._future is not None | |||
@@ -275,6 +285,7 @@ def dump_graph( | |||
keep_param_name: bool = False, | |||
keep_opr_priority: bool = False, | |||
strip_info_file=None, | |||
append_json=False | |||
): | |||
"""serialize the computing graph of `output_vars` and get byte result. | |||
@@ -295,6 +306,9 @@ def dump_graph( | |||
:param keep_opr_priority: whether to keep priority setting for operators | |||
:param strip_info_file: a string for path or a file handler. if is not None, | |||
then the dump information for code strip would be written to ``strip_info_file`` | |||
:param append_json: only checked when `strip_info_file` is not None. If set to
True, the code strip information will be appended to ``strip_info_file``;
if set to False, ``strip_info_file`` will be overwritten
:return: dump result as byte string, and an instance of namedtuple | |||
:class:`CompGraphDumpResult`, whose fields are: | |||
@@ -342,10 +356,25 @@ def dump_graph( | |||
if strip_info_file is not None: | |||
if isinstance(strip_info_file, str): | |||
strip_info_file = open(strip_info_file, "w") | |||
strip_info = json.loads(_imperative_rt.get_info_for_strip(ov)) | |||
strip_info["hash"] = dump_info.content_hash | |||
json.dump(strip_info, strip_info_file) | |||
if not os.path.exists(strip_info_file): | |||
os.mknod(strip_info_file) | |||
strip_info_file = open(strip_info_file, "r+") | |||
new_strip_dict = json.loads(_imperative_rt.get_info_for_strip(ov)) | |||
ori_strip_dict = new_strip_dict | |||
json_content = strip_info_file.read() | |||
if append_json and len(json_content) != 0: | |||
# if the json file already has contents, read them first and then append the new information
ori_strip_dict = json.loads(json_content) | |||
for k in ori_strip_dict: | |||
new_strip_dict_v = new_strip_dict.get(k) | |||
if new_strip_dict_v is not None: | |||
for value in new_strip_dict_v: | |||
if value not in ori_strip_dict[k]:
ori_strip_dict[k].append(value) | |||
ori_strip_dict["hash"] = dump_info.content_hash | |||
strip_info_file.seek(0) | |||
strip_info_file.truncate() | |||
json.dump(ori_strip_dict, strip_info_file) | |||
return dump_content, dump_info | |||
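The merge step above only extends value lists under keys that already exist in the on-disk dict; a simplified standalone sketch of that rule (plain dicts, hypothetical helper name):

def merge_strip_info(existing: dict, new: dict) -> dict:
    # for every key already on disk, append values from the new dump that are missing
    for k in existing:
        for v in new.get(k, []):
            if v not in existing[k]:
                existing[k].append(v)
    return existing

# merge_strip_info({"opr": ["Elemwise"]}, {"opr": ["Elemwise", "Pooling"]})
# -> {"opr": ["Elemwise", "Pooling"]}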
@@ -358,7 +387,7 @@ CompGraphLoadResult = collections.namedtuple( | |||
def load_graph(fpath): | |||
"""Load a serialized computing graph from file. | |||
:parma fpath: Path or Handle for the output file | |||
:param fpath: Path or Handle of the input file | |||
:return: An instance of namedtuple :class:`CompGraphLoadResult`, | |||
whose fields are: | |||
@@ -40,6 +40,8 @@ | |||
# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. | |||
# -------------------------------------------------------------------------------------- | |||
from collections import OrderedDict | |||
from .utils import _toposort, groupby | |||
from .variadic import isvariadic | |||
@@ -159,5 +161,5 @@ def ordering(signatures): | |||
for s in signatures: | |||
if s not in edges: | |||
edges[s] = [] | |||
edges = dict((k, [b for a, b in v]) for k, v in edges.items()) | |||
edges = OrderedDict((k, [b for a, b in v]) for k, v in edges.items()) | |||
return _toposort(edges) |
@@ -100,6 +100,8 @@ def _(data: DeviceTensorND): | |||
@as_raw_tensor.register(np.ndarray) | |||
def _(array: np.ndarray, dtype=None, device=None): | |||
device = None if device is None else as_device(device).to_c() | |||
if 0 in array.strides: | |||
array = array.squeeze().reshape(array.shape) | |||
return RawTensor(put(array, dtype=dtype, device=device)) | |||
@@ -57,7 +57,29 @@ def _transpose(data, axes): | |||
def _broadcast(inp, shape): | |||
def valid_broadcast(src, tar): | |||
def failed(): | |||
raise ValueError( | |||
"the input shape {} can not be broadcasted to target shape {}".format( | |||
src, tar | |||
) | |||
) | |||
if isinstance(src, (TensorBase, TensorWrapperBase)): | |||
src = src.numpy() | |||
if isinstance(tar, (TensorBase, TensorWrapperBase)): | |||
tar = tar.numpy() | |||
if len(src) > len(tar): | |||
failed() | |||
for i in range(min(len(src), len(tar))): | |||
if src[-i - 1] != 1 and src[-i - 1] != tar[-i - 1]: | |||
failed() | |||
shape = utils.astensor1d(shape, inp, dtype="int32", device=inp.device) | |||
valid_broadcast(inp.shape, shape) | |||
(result,) = apply(builtin.Broadcast(), inp, shape) | |||
return result | |||
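The validity check above follows the usual numpy-style rule: trailing dimensions must match or be 1 on the source side. A minimal standalone sketch (hypothetical helper, not the MegEngine API):

def can_broadcast(src, tar):
    if len(src) > len(tar):
        return False
    return all(s == 1 or s == t for s, t in zip(reversed(src), reversed(tar)))

# can_broadcast((1, 3), (2, 3)) -> True,  can_broadcast((2, 3), (2, 4)) -> False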
@@ -158,6 +180,10 @@ def _reduce(mode): | |||
def f(self, axis=None, keepdims: bool = False): | |||
data = self | |||
(data,) = utils.convert_inputs(data) | |||
if mode == "MEAN": | |||
data = data.astype("float32") | |||
elif self.dtype == np.bool_: | |||
data = data.astype("int32") | |||
if axis is None: | |||
data = data.reshape(-1) | |||
assert not keepdims, "can not set axis=None and keepdims=True" | |||
@@ -180,6 +206,9 @@ def _reduce(mode): | |||
if not keepdims: | |||
result = _remove_axis(result, axis) | |||
if self.dtype == np.bool_: | |||
if mode in ["MIN", "MAX"]: | |||
result = result.astype("bool") | |||
return result | |||
return f | |||
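A hypothetical illustration of the dtype handling above (assumes a working MegEngine build; the printed dtypes follow the casts in _reduce):

from megengine import tensor

b = tensor([True, True, False])
print(b.sum().dtype)                    # int32: bool inputs are reduced as int32
print(b.max().dtype)                    # bool: MIN/MAX results are cast back to bool
print(tensor([1, 2, 3]).mean().dtype)   # float32: MEAN always computes in float32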
@@ -203,7 +232,8 @@ def _todo(*_): | |||
def _expand_args(args): | |||
if len(args) == 1: | |||
if isinstance( | |||
args[0], (collections.abc.Sequence, TensorBase, TensorWrapperBase) | |||
args[0], | |||
(collections.abc.Sequence, TensorBase, TensorWrapperBase, np.ndarray), | |||
): | |||
args = args[0] | |||
return args | |||
@@ -366,7 +396,8 @@ class ArrayMethodMixin(abc.ABC): | |||
def reshape(self, *args): | |||
return _reshape(self, _expand_args(args)) | |||
def broadcast(self, *args): | |||
# FIXME: remove this method | |||
def _broadcast(self, *args): | |||
return _broadcast(self, _expand_args(args)) | |||
def transpose(self, *args): | |||
@@ -377,7 +408,38 @@ class ArrayMethodMixin(abc.ABC): | |||
def flatten(self): | |||
return self.reshape(-1) | |||
sum = _reduce("SUM") | |||
def sum(self, axis=None, keepdims: bool = False): | |||
r"""Returns the sum of each row of the input tensor in the given dimension ``axis``. | |||
If ``axis`` is a list of axes, reduce over all of them.
If ``keepdims`` is ``True``, the shape of the output tensor is the same as the input tensor, except in the dimension(s) ``axis`` where it is of size 1. Otherwise, ``axis`` is squeezed (see :meth:`~.functional.tensor.squeeze`).
Same for prod/mean/max/min. | |||
:param axis: the dimension or dimensions to reduce. | |||
:param keepdims: whether the output tensor has ndim retained or not.
:return: output tensor. | |||
Examples: | |||
.. testcode:: | |||
from megengine import tensor | |||
a = tensor([False, True, True, False]) | |||
b = tensor([1.0, 2.0, 3.0, 4.0]) | |||
print(a.sum().numpy()) | |||
print(b.sum().numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[2] | |||
[10.] | |||
""" | |||
return _reduce("SUM")(self, axis, keepdims) | |||
prod = _reduce("PRODUCT") | |||
min = _reduce("MIN") | |||
max = _reduce("MAX") | |||
@@ -16,39 +16,74 @@ from ..ops.special import Const | |||
from ..tensor.core import OpBase, TensorBase, TensorWrapperBase, apply | |||
def dtype_promotion(raw_inputs): | |||
def add_dtype(i): | |||
if type(i) == int: | |||
return np.array(i, dtype=np.int32) | |||
if type(i) == float: | |||
return np.array(i, dtype=np.float32) | |||
if type(i) == bool: | |||
return np.array(i, dtype=np.bool_) | |||
return None | |||
scalar_inputs = [ | |||
add_dtype(i) for i in raw_inputs if not hasattr(i, "dtype") and add_dtype(i) | |||
] | |||
inputs = [i for i in raw_inputs if hasattr(i, "dtype")] | |||
assert len(scalar_inputs + inputs) > 0 | |||
dtype = None | |||
if len(inputs) > 0: | |||
dtype = np.result_type(*inputs) | |||
dtype_all = np.result_type(*(inputs + scalar_inputs)) | |||
assert ( | |||
dtype != np.float64 and dtype != np.int64 | |||
), "unsupport dtype {} by dtype_promotion, please use explict type convert".format( | |||
dtype | |||
) | |||
if dtype_all == np.bool_: | |||
for i in raw_inputs: | |||
if not hasattr(i, "dtype") or i.dtype != np.bool_: | |||
raise TypeError( | |||
"bool dtype can not be operated with an element without bool dtype" | |||
) | |||
if dtype_all == np.float64: | |||
dtype_all = np.float32 | |||
return dtype_all | |||
def dtype_promotion(inputs): | |||
""" | |||
Returns the dtype that would result from performing an arithmetic | |||
operation on the provided input tensors and scalars. | |||
""" | |||
# map numpy.dtype.kind to priority | |||
category_priority = { | |||
"f": 3, # floating-point | |||
"i": 2, # signed integer | |||
"u": 2, # unsigned integer | |||
"b": 1, # boolean | |||
} | |||
def scalar2dtype(x): | |||
""" | |||
For scalar `x`, returns its corresponding type. A floating point scalar | |||
has dtype 'float32'. An integral non-boolean scalar has dtype 'int32'. | |||
A boolean scalar has dtype 'bool'. | |||
""" | |||
if isinstance(x, bool): | |||
return np.bool_ | |||
if isinstance(x, int): | |||
return np.int32 | |||
if isinstance(x, float): | |||
return np.float32 | |||
def promote_types(types, cat): | |||
""" | |||
Returns the data type with sufficient size to hold all types of | |||
category `cat` in the list `types`. | |||
""" | |||
used_types = [ | |||
i for i in types if category_priority.get(np.dtype(i).kind, 0) == cat | |||
] | |||
assert len(used_types) > 0 | |||
res = used_types[0] | |||
for i in used_types: | |||
res = np.promote_types(res, i) | |||
return res | |||
def max_priority(types): | |||
""" | |||
Returns the maximum value of the priority of each type in the list | |||
`types`. | |||
""" | |||
if not types: | |||
return 0 | |||
else: | |||
return max([category_priority.get(np.dtype(i).kind, 0) for i in types]) | |||
scalars = [] | |||
tensors = [] | |||
for data in inputs: | |||
if hasattr(data, "dtype"): | |||
tensors.append(data.dtype) | |||
elif isinstance(data, (float, int, bool)): | |||
scalars.append(scalar2dtype(data)) | |||
max_pri_scalars = max_priority(scalars) | |||
max_pri_tensors = max_priority(tensors) | |||
assert max_pri_scalars > 0 or max_pri_tensors > 0 | |||
if max_pri_scalars > max_pri_tensors: | |||
return promote_types(scalars, max_pri_scalars) | |||
else: | |||
return promote_types(tensors, max_pri_tensors) | |||
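A small illustration of the resulting rules (numpy arrays stand in for tensors, since only a dtype attribute is inspected; assumes dtype_promotion is importable as defined above):

import numpy as np

# tensors of the highest-priority category are promoted among themselves
assert dtype_promotion([np.zeros(1, np.int32), np.zeros(1, np.float32)]) == np.float32
# a float scalar outranks integer tensors ...
assert dtype_promotion([np.zeros(1, np.int32), 1.5]) == np.float32
# ... but an int scalar defers to a float tensor, keeping its width
assert dtype_promotion([np.zeros(1, np.float16), 2]) == np.float16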
def get_device(inputs): | |||
@@ -1,9 +0,0 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
from .comp_graph_tools import * |
@@ -26,7 +26,7 @@ def _clear_plasma_store(): | |||
# `_PlasmaStoreManager.__del__` will not be called automatically in subprocess,
# so this function should be called explicitly | |||
global MGE_PLASMA_STORE_MANAGER | |||
if MGE_PLASMA_STORE_MANAGER is not None: | |||
if MGE_PLASMA_STORE_MANAGER is not None and MGE_PLASMA_STORE_MANAGER.refcount == 0: | |||
del MGE_PLASMA_STORE_MANAGER | |||
MGE_PLASMA_STORE_MANAGER = None | |||
@@ -50,6 +50,7 @@ class _PlasmaStoreManager: | |||
stderr=None if debug_flag else subprocess.DEVNULL, | |||
) | |||
self.__initialized = True | |||
self.refcount = 1 | |||
def __del__(self): | |||
if self.__initialized and self.plasma_store.returncode is None: | |||
@@ -83,6 +84,8 @@ class PlasmaShmQueue: | |||
"Exception happened in starting plasma_store: {}\n" | |||
"Tips: {}".format(str(e), err_info) | |||
) | |||
else: | |||
MGE_PLASMA_STORE_MANAGER.refcount += 1 | |||
self.socket_name = MGE_PLASMA_STORE_MANAGER.socket_name | |||
@@ -133,6 +136,8 @@ class PlasmaShmQueue: | |||
def close(self): | |||
self.queue.close() | |||
self.disconnect_client() | |||
global MGE_PLASMA_STORE_MANAGER | |||
MGE_PLASMA_STORE_MANAGER.refcount -= 1 | |||
_clear_plasma_store() | |||
def cancel_join_thread(self): | |||
@@ -34,14 +34,14 @@ default_collate_err_msg_format = ( | |||
class Collator: | |||
r""" | |||
Used for merge a list of samples to form a mini-batch of Tenor(s). Used when using batched loading from a dataset. | |||
modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py | |||
Used for merging a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a dataset. | |||
Modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py | |||
""" | |||
def apply(self, inputs): | |||
""" | |||
input : sequence_N(tuple(CHW, C, CK)) | |||
output : tuple(NCHW, NC, NCK) | |||
:param input: sequence_N(tuple(CHW, C, CK)). | |||
:return: tuple(NCHW, NC, NCK). | |||
""" | |||
elem = inputs[0] | |||
elem_type = type(elem) | |||
@@ -43,7 +43,7 @@ class DataLoader: | |||
): | |||
r"""Provides a convenient way to iterate on a given dataset. | |||
`DataLoader` combines a dataset with sampler, transform and collator, | |||
`DataLoader` combines a dataset with `sampler`, `transform` and `collator`, | |||
make it flexible to get minibatch continually from a dataset. | |||
:type dataset: Dataset | |||
@@ -53,21 +53,21 @@ class DataLoader: | |||
If specified, :attr:`shuffle` must be ``False``. | |||
:type transform: Transform | |||
:param transform: defined the transforming strategy for a sampled batch. | |||
(default: ``None``) | |||
Default: None | |||
:type collator: Collator | |||
:param collator: defined the merging strategy for a transformed batch. | |||
(default: ``None``) | |||
Default: None | |||
:type num_workers: int | |||
:param num_workers: the number of sub-process to load, transform and collate | |||
the batch. ``0`` means using single-process. (default: ``0``) | |||
the batch. ``0`` means using single-process. Default: 0 | |||
:type timeout: int | |||
:param timeout: if positive, means the timeout value(second) for collecting a | |||
batch from workers. (default: 0) | |||
batch from workers. Default: 0 | |||
:type divide: bool | |||
:param divide: define the paralleling strategy in multi-processing mode. | |||
``True`` means one batch is divided into :attr:`num_workers` pieces, and | |||
the workers will process these pieces parallelly. ``False`` means | |||
different sub-process will process different batch. (default: ``False``) | |||
different sub-process will process different batch. Default: False | |||
""" | |||
@@ -12,7 +12,7 @@ from typing import Tuple | |||
class Dataset(ABC): | |||
r""" | |||
An abstract class for all Datasets | |||
An abstract class for all Datasets. | |||
""" | |||
@abstractmethod | |||
@@ -22,8 +22,8 @@ class Dataset(ABC): | |||
class MapDataset(Dataset): | |||
r""" | |||
An abstract class for map data | |||
__getitem__ and __len__ method are aditionally needed | |||
An abstract class for map data. | |||
__getitem__ and __len__ methods are additionally needed.
""" | |||
@abstractmethod | |||
@@ -41,8 +41,8 @@ class MapDataset(Dataset): | |||
class StreamDataset(Dataset): | |||
r""" | |||
An abstract class for stream data | |||
__iter__ method is aditionally needed | |||
An abstract class for stream data. | |||
__iter__ method is additionally needed.
""" | |||
@abstractmethod | |||
@@ -21,7 +21,7 @@ logger = get_logger(__name__) | |||
class CIFAR10(VisionDataset): | |||
r""" ``Dataset`` for CIFAR10 meta data | |||
r""" ``Dataset`` for CIFAR10 meta data. | |||
""" | |||
url_path = "http://www.cs.utoronto.ca/~kriz/" | |||
@@ -118,7 +118,7 @@ class COCO(VisionDataset): | |||
self.ids = ids | |||
self.json_category_id_to_contiguous_id = { | |||
v: i + 1 for i, v in enumerate(self.cats.keys()) | |||
v: i + 1 for i, v in enumerate(sorted(self.cats.keys())) | |||
} | |||
self.contiguous_category_id_to_json_id = { | |||
@@ -30,19 +30,18 @@ class ImageFolder(VisionDataset): | |||
r""" | |||
ImageFolder is a class for loading image data and labels from a organized folder. | |||
the folder is expected to be organized as followed | |||
root/cls/xxx.img_ext | |||
The folder is expected to be organized as follows: root/cls/xxx.img_ext
labels are indices of sorted classes in the root directory | |||
Labels are indices of sorted classes in the root directory. | |||
:param root: root directory of an image folder | |||
:param root: root directory of an image folder. | |||
:param loader: a function used to load image from path, | |||
if ``None``, default function that loads | |||
images with PILwill be called | |||
images with PIL will be called. | |||
:param check_valid_func: a function used to check if files in folder are | |||
expected image files, if ``None``, default function | |||
that checks file extensions will be called | |||
:param class_name: if ``True``, return class name instead of class index | |||
that checks file extensions will be called. | |||
:param class_name: if ``True``, return class name instead of class index. | |||
""" | |||
super().__init__(root, order=("image", "image_category")) | |||
@@ -31,7 +31,7 @@ logger = get_logger(__name__) | |||
class ImageNet(ImageFolder): | |||
r""" | |||
Load ImageNet from raw files or folder, expected folder looks like | |||
Load ImageNet from raw files or folder. Expected folder looks like: | |||
.. code-block:: bash | |||
@@ -60,25 +60,25 @@ class ImageNet(ImageFolder): | |||
def __init__(self, root: str = None, train: bool = True, **kwargs): | |||
r""" | |||
initialization: | |||
Initialization: | |||
* if ``root`` contains ``self.target_folder`` depent on ``train``: | |||
* if ``root`` contains ``self.target_folder`` depending on ``train``: | |||
* initialize ImageFolder with target_folder | |||
* initialize ImageFolder with target_folder. | |||
* else: | |||
* if all raw files are in ``root``: | |||
* parse ``self.target_folder`` from raw files | |||
* initialize ImageFolder with ``self.target_folder`` | |||
* parse ``self.target_folder`` from raw files. | |||
* initialize ImageFolder with ``self.target_folder``. | |||
* else: | |||
* raise error | |||
* raise error. | |||
:param root: root directory of imagenet data, if root is ``None``, used default_dataset_root | |||
:param train: if ``True``, load the train split, otherwise load the validation split | |||
:param root: root directory of imagenet data, if root is ``None``, use default_dataset_root. | |||
:param train: if ``True``, load the train split, otherwise load the validation split. | |||
""" | |||
# process the root path | |||
@@ -22,12 +22,12 @@ logger = get_logger(__name__) | |||
class MNIST(VisionDataset): | |||
r""" ``Dataset`` for MNIST meta data | |||
r""" ``Dataset`` for MNIST meta data. | |||
""" | |||
url_path = "http://yann.lecun.com/exdb/mnist/" | |||
""" | |||
url prefix for downloading raw file | |||
URL prefix for downloading raw files.
""" | |||
raw_file_name = [ | |||
"train-images-idx3-ubyte.gz", | |||
@@ -36,7 +36,7 @@ class MNIST(VisionDataset): | |||
"t10k-labels-idx1-ubyte.gz", | |||
] | |||
""" | |||
raw file names of both training set and test set (10k) | |||
Raw file names of both training set and test set (10k). | |||
""" | |||
raw_file_md5 = [ | |||
"f68b3c2dcbeaaa9fbdd348bbdeb94873", | |||
@@ -45,7 +45,7 @@ class MNIST(VisionDataset): | |||
"ec29112dd5afa0611ce80d1b7f02629c", | |||
] | |||
""" | |||
md5 for checking raw files | |||
MD5 checksums for checking the raw files.
""" | |||
def __init__( | |||
@@ -57,10 +57,10 @@ class MNIST(VisionDataset): | |||
): | |||
r""" | |||
:param root: path for mnist dataset downloading or loading, if ``None``, | |||
set ``root`` to the ``_default_root`` | |||
:param train: if ``True``, loading trainingset, else loading test set | |||
set ``root`` to the ``_default_root``. | |||
:param train: if ``True``, load the training set, else load the test set.
:param download: if raw files do not exists and download sets to ``True``, | |||
download raw files and process, otherwise raise ValueError, default is True | |||
download raw files and process them, otherwise raise ValueError. Default: True.
""" | |||
super().__init__(root, order=("image", "image_category")) | |||
@@ -81,7 +81,7 @@ class Objects365(VisionDataset): | |||
self.ids = ids | |||
self.json_category_id_to_contiguous_id = { | |||
v: i + 1 for i, v in enumerate(self.cats.keys()) | |||
v: i + 1 for i, v in enumerate(sorted(self.cats.keys())) | |||
} | |||
self.contiguous_category_id_to_json_id = { | |||
@@ -75,6 +75,8 @@ class PascalVOC(VisionDataset): | |||
else: | |||
raise NotImplementedError | |||
self.img_infos = dict() | |||
def __getitem__(self, index): | |||
target = [] | |||
for k in self.order: | |||
@@ -107,9 +109,8 @@ class PascalVOC(VisionDataset): | |||
mask = mask[:, :, np.newaxis] | |||
target.append(mask) | |||
elif k == "info": | |||
if image is None: | |||
image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) | |||
info = [image.shape[0], image.shape[1], self.file_names[index]] | |||
info = self.get_img_info(index, image) | |||
info = [info["height"], info["width"], info["file_name"]] | |||
target.append(info) | |||
else: | |||
raise NotImplementedError | |||
@@ -119,6 +120,17 @@ class PascalVOC(VisionDataset): | |||
def __len__(self): | |||
return len(self.images) | |||
def get_img_info(self, index, image=None): | |||
if index not in self.img_infos: | |||
if image is None: | |||
image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) | |||
self.img_infos[index] = dict( | |||
height=image.shape[0], | |||
width=image.shape[1], | |||
file_name=self.file_names[index], | |||
) | |||
return self.img_infos[index] | |||
def _trans_mask(self, mask): | |||
label = np.ones(mask.shape[:2]) * 255 | |||
for i in range(len(self.class_colors)): | |||
@@ -171,25 +183,3 @@ class PascalVOC(VisionDataset): | |||
"train", | |||
"tvmonitor", | |||
) | |||
class_colors = [ | |||
[0, 0, 128], | |||
[0, 128, 0], | |||
[0, 128, 128], | |||
[128, 0, 0], | |||
[128, 0, 128], | |||
[128, 128, 0], | |||
[128, 128, 128], | |||
[0, 0, 64], | |||
[0, 0, 192], | |||
[0, 128, 64], | |||
[0, 128, 192], | |||
[128, 0, 64], | |||
[128, 0, 192], | |||
[128, 128, 64], | |||
[128, 128, 192], | |||
[0, 64, 0], | |||
[0, 64, 128], | |||
[0, 192, 0], | |||
[0, 192, 128], | |||
[128, 64, 0], | |||
] |
@@ -28,25 +28,25 @@ class Sampler(ABC): | |||
seed=None, | |||
): | |||
r""" | |||
An abstract class for all sampler | |||
An abstract class for all sampler. | |||
:type dataset: `dataset` | |||
:param dataset: dataset to sample from | |||
:param dataset: dataset to sample from. | |||
:type batch_size: positive integer | |||
:param batch_size: batch size for batch method | |||
:param batch_size: batch size for batch method. | |||
:type drop_last: bool | |||
:param drop_last: set ``True`` to drop the last incomplete batch, | |||
if the dataset size is not divisible by the batch size. If ``False`` and | |||
the size of dataset is not divisible by the batch_size, then the last batch will | |||
be smaller. (default: ``False``) | |||
be smaller. Default: False | |||
:type num_samples: positive integer | |||
:param num_samples: number of samples assigned to one rank | |||
:param num_samples: number of samples assigned to one rank. | |||
:type world_size: positive integer | |||
:param world_size: number of ranks | |||
:param world_size: number of ranks. | |||
:type rank: non-negative integer within 0 and world_size | |||
:param rank: rank id, non-negative interger within 0 and ``world_size`` | |||
:param rank: rank id, a non-negative integer within 0 and ``world_size``.
:type seed: non-negative integer | |||
:param seed: seed for random operators | |||
:param seed: seed for random operators. | |||
""" | |||
if ( | |||
not isinstance(batch_size, int) | |||
@@ -103,15 +103,15 @@ class Sampler(ABC): | |||
def sample(self): | |||
""" | |||
return a list contains all sample indices | |||
Return a list containing all sample indices. | |||
""" | |||
raise NotImplementedError | |||
def scatter(self, indices) -> List: | |||
r""" | |||
scatter method is used for splitting indices into subset, each subset | |||
Scatter method is used for splitting indices into subsets; each subset | |||
will be assigned to a rank. Indices are evenly split by default. | |||
If customized indices assignment method is needed, please rewrite this method | |||
If a customized index assignment method is needed, please override this method. | |||
""" | |||
total_size = self.num_samples * self.world_size | |||
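A toy illustration of the even split described in the docstring (this mirrors the idea of one subset per rank, not the library's exact slicing logic):

    indices = list(range(8))
    world_size, num_samples = 2, 4
    subsets = [indices[r * num_samples:(r + 1) * num_samples] for r in range(world_size)]
    # rank 0 gets [0, 1, 2, 3], rank 1 gets [4, 5, 6, 7]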
@@ -127,7 +127,7 @@ class Sampler(ABC): | |||
def batch(self) -> Iterator[List[Any]]: | |||
r""" | |||
batch method provides a batch indices generator | |||
Batch method provides a batch indices generator. | |||
""" | |||
indices = list(self.sample()) | |||
@@ -156,7 +156,7 @@ class SequentialSampler(Sampler): | |||
rank=None, | |||
): | |||
r""" | |||
Sample elements sequentially | |||
Sample elements sequentially. | |||
""" | |||
super().__init__(dataset, batch_size, drop_last, None, world_size, rank) | |||
if indices is not None and not isinstance(indices, collections.abc.Sequence): | |||
@@ -168,7 +168,7 @@ class SequentialSampler(Sampler): | |||
def sample(self) -> Iterator[Any]: | |||
r""" | |||
return a generator | |||
Return a generator. | |||
""" | |||
if self.indices is None: | |||
return iter(range(len(self.dataset))) | |||
@@ -188,7 +188,7 @@ class RandomSampler(Sampler): | |||
seed=None, | |||
): | |||
r""" | |||
Sample elements randomly without replacement | |||
Sample elements randomly without replacement. | |||
""" | |||
super().__init__(dataset, batch_size, drop_last, None, world_size, rank, seed) | |||
if indices is not None and not isinstance(indices, collections.abc.Sequence): | |||
@@ -218,10 +218,10 @@ class ReplacementSampler(Sampler): | |||
seed=None, | |||
): | |||
r""" | |||
Sample elements randomly with replacement | |||
Sample elements randomly with replacement. | |||
:type weights: List | |||
:param weights: weights for sampling indices, it could be unnormalized weights | |||
:param weights: weights for sampling indices; they can be unnormalized. | |||
""" | |||
super().__init__( | |||
dataset, batch_size, drop_last, num_samples, world_size, rank, seed | |||
@@ -250,7 +250,7 @@ class ReplacementSampler(Sampler): | |||
class Infinite(Sampler): | |||
r"""Infinite Sampler warper for basic sampler""" | |||
r"""Infinite Sampler warper for basic sampler.""" | |||
def sample(self): | |||
raise NotImplementedError("sample method not supported in Infinite") | |||
@@ -12,7 +12,7 @@ from typing import Sequence, Tuple | |||
class Transform(ABC): | |||
""" | |||
rewrite apply method in subclass | |||
Rewrite the ``apply`` method in a subclass. | |||
""" | |||
def apply_batch(self, inputs: Sequence[Tuple]): | |||
@@ -15,7 +15,7 @@ import numpy as np | |||
def wrap_keepdims(func): | |||
"""Wraper to keep the dimension of input images unchanged""" | |||
"""Wraper to keep the dimension of input images unchanged.""" | |||
@functools.wraps(func) | |||
def wrapper(image, *args, **kwargs): | |||
@@ -34,10 +34,10 @@ def wrap_keepdims(func): | |||
@wrap_keepdims | |||
def to_gray(image): | |||
r""" | |||
Change BGR format image's color space to gray | |||
Change BGR format image's color space to gray. | |||
:param image: Input BGR format image, with (H, W, C) shape | |||
:return: Gray format image, with (H, W, C) shape | |||
:param image: input BGR format image, with `(H, W, C)` shape. | |||
:return: gray format image, with `(H, W, C)` shape. | |||
""" | |||
return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) | |||
@@ -45,10 +45,10 @@ def to_gray(image): | |||
@wrap_keepdims | |||
def to_bgr(image): | |||
r""" | |||
Change gray format image's color space to BGR | |||
Change gray format image's color space to BGR. | |||
:param image: input Gray format image, with (H, W, C) shape | |||
:return: BGR format image, with (H, W, C) shape | |||
:param image: input Gray format image, with `(H, W, C)` shape. | |||
:return: BGR format image, with `(H, W, C)` shape. | |||
""" | |||
return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) | |||
@@ -56,18 +56,18 @@ def to_bgr(image): | |||
@wrap_keepdims | |||
def pad(input, size, value): | |||
r""" | |||
Pad input data with *value* and given *size* | |||
Pad input data with *value* and given *size*. | |||
:param input: Input data, with (H, W, C) shape | |||
:param size: Padding size of input data, it could be integer or sequence. | |||
If it's an integer, the input data will be padded in four directions. | |||
If it's a sequence contains two integer, the bottom and right side | |||
:param input: input data, with `(H, W, C)` shape. | |||
:param size: padding size of input data, it could be integer or sequence. | |||
If it is an integer, the input data will be padded in four directions. | |||
If it is a sequence containing two integers, the bottom and right side | |||
of input data will be padded. | |||
If it's a sequence contains four integer, the top, bottom, left, right | |||
If it is a sequence containing four integers, the top, bottom, left, right | |||
side of input data will be padded with given size. | |||
:param value: Padding value of data, could be a sequence of int or float. | |||
if it's float value, the dtype of image will be casted to float32 also. | |||
:return: Padded image | |||
:param value: padding value of data, could be a sequence of int or float. | |||
If it is a float value, the dtype of the image will also be cast to float32. | |||
:return: padded image. | |||
""" | |||
if isinstance(size, int): | |||
size = (size, size, size, size) | |||
@@ -81,14 +81,18 @@ def pad(input, size, value): | |||
@wrap_keepdims | |||
def flip(image, flipCode): | |||
r""" | |||
Accordding to the flipCode (the type of flip), flip the input image | |||
According to the flipCode (the type of flip), flip the input image. | |||
:param image: Input image, with (H, W, C) shape | |||
:param image: input image, with `(H, W, C)` shape. | |||
:param flipCode: code that indicates the type of flip. | |||
1 : Flip horizontally | |||
0 : Flip vertically | |||
-1 : Flip horizontally and vertically | |||
:return: BGR format image, with (H, W, C) shape | |||
* 1 : Flip horizontally | |||
* 0 : Flip vertically | |||
* -1: Flip horizontally and vertically | |||
:return: BGR format image, with `(H, W, C)` shape. | |||
""" | |||
return cv2.flip(image, flipCode=flipCode) | |||
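The flipCode values map directly onto ``cv2.flip``; a short sketch with a dummy ``(H, W, C)`` array:

    import cv2
    import numpy as np

    img = np.arange(12, dtype=np.uint8).reshape(2, 2, 3)
    horizontal = cv2.flip(img, 1)    #  1 -> flip left/right
    vertical = cv2.flip(img, 0)      #  0 -> flip top/bottom
    both = cv2.flip(img, -1)         # -1 -> flip along both axes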
@@ -96,12 +100,12 @@ def flip(image, flipCode): | |||
@wrap_keepdims | |||
def resize(input, size, interpolation=cv2.INTER_LINEAR): | |||
r""" | |||
resize the input data to given size | |||
Resize the input data to given size. | |||
:param input: Input data, could be image or masks, with (H, W, C) shape | |||
:param size: Target size of input data, with (height, width) shape. | |||
:param interpolation: Interpolation method. | |||
:return: Resized data, with (H, W, C) shape | |||
:param input: input data, could be image or masks, with `(H, W, C)` shape. | |||
:param size: target size of input data, with (height, width) shape. | |||
:param interpolation: interpolation method. | |||
:return: resized data, with `(H, W, C)` shape. | |||
""" | |||
if len(size) != 2: | |||
raise ValueError("resize needs (h, w), but got {}".format(size)) | |||
@@ -44,26 +44,26 @@ __all__ = [ | |||
class VisionTransform(Transform): | |||
r""" | |||
Base class of all transforms used in computer vision. | |||
calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*() | |||
Calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*() | |||
method. If you want to implement a self-defined transform method for image, | |||
rewrite _apply_image method in subclass. | |||
:param order: Input type order. Input is a tuple contains different structures, | |||
:param order: input type order. Input is a tuple containing different structures, | |||
order is used to specify the order of structures. For example, if your input | |||
is (image, boxes) type, then the order should be ("image", "boxes"). | |||
Current available strings & data type are describe below: | |||
is (image, boxes) type, then the ``order`` should be ("image", "boxes"). | |||
Currently available strings and data types are described below: | |||
* "image": input image, with shape of (H, W, C) | |||
* "coords": coordinates, with shape of (N, 2) | |||
* "boxes": bounding boxes, with shape of (N, 4), "xyxy" format, | |||
* "image": input image, with shape of `(H, W, C)`. | |||
* "coords": coordinates, with shape of `(N, 2)`. | |||
* "boxes": bounding boxes, with shape of `(N, 4)`, "xyxy" format, | |||
the 1st "xy" represents top left point of a box, | |||
the 2nd "xy" represents right bottom point. | |||
* "mask": map used for segmentation, with shape of (H, W, 1) | |||
* "keypoints": keypoints with shape of (N, K, 3), N for number of instances, | |||
* "mask": map used for segmentation, with shape of `(H, W, 1)`. | |||
* "keypoints": keypoints with shape of `(N, K, 3)`, N for number of instances, | |||
and K for number of keypoints in one instance. The first two dimensions | |||
of the last axis are the coordinates of keypoints and the 3rd dimension is | |||
the label of keypoints. | |||
* "polygons": A sequence contains numpy array, its length is number of instances. | |||
* "polygons": a sequence containing numpy arrays, its length is the number of instances. | |||
Each numpy array represents polygon coordinate of one instance. | |||
* "category": categories for some data type. For example, "image_category" | |||
means category of the input image and "boxes_category" means categories of | |||
@@ -94,11 +94,11 @@ class VisionTransform(Transform): | |||
self.order = order | |||
def apply_batch(self, inputs: Sequence[Tuple]): | |||
r"""Apply transform on batch input data""" | |||
r"""Apply transform on batch input data.""" | |||
return tuple(self.apply(input) for input in inputs) | |||
def apply(self, input: Tuple): | |||
r"""Apply transform on single input data""" | |||
r"""Apply transform on single input data.""" | |||
if not isinstance(input, tuple): | |||
input = (input,) | |||
@@ -156,10 +156,10 @@ class VisionTransform(Transform): | |||
class ToMode(VisionTransform): | |||
r"""Change input data to a target mode. | |||
For example, most transforms use HWC mode image, | |||
while the Neural Network might use CHW mode input tensor | |||
while the neural network might use CHW mode input tensor. | |||
:param mode: Output mode of input. Use "CHW" mode by default. | |||
:param order: The same with :class:`VisionTransform` | |||
:param mode: output mode of input. Default: "CHW" | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, mode="CHW", *, order=None): | |||
@@ -185,14 +185,14 @@ class Compose(VisionTransform): | |||
r""" | |||
Composes several transforms together. | |||
:param transforms: List of :class:`VisionTransform` to compose. | |||
:param batch_compose: Whether use shuffle_indices for batch data or not. | |||
:param transforms: list of :class:`VisionTransform` to compose. | |||
:param batch_compose: whether to use shuffle_indices for batch data or not. | |||
If True, use original input sequence. | |||
Otherwise, the shuffle_indices will be used for transforms. | |||
:param shuffle_indices: Indices used for random shuffle, start at 1. | |||
:param shuffle_indices: indices used for random shuffle, start at 1. | |||
For example, if shuffle_indices is [(1, 3), (2, 4)], then the 1st and 3rd transform | |||
will be randomly shuffled; the 2nd and 4th transform will also be shuffled. | |||
:param order: The same with :class:`VisionTransform` | |||
:param order: the same as :class:`VisionTransform`. | |||
Examples: | |||
@@ -264,8 +264,8 @@ class TorchTransformCompose(VisionTransform): | |||
some torchvision transforms that operate on tensors are not supported, | |||
such as Normalize and ToTensor. | |||
:param transforms: The same with ``Compose`` | |||
:param order: The same with :class:`VisionTransform` | |||
:param transforms: the same as ``Compose``. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, transforms, *, order=None): | |||
@@ -303,16 +303,16 @@ class TorchTransformCompose(VisionTransform): | |||
class Pad(VisionTransform): | |||
r"""Pad the input data. | |||
:param size: Padding size of input image, it could be integer or sequence. | |||
If it's an integer, the input image will be padded in four directions. | |||
If it's a sequence contains two integer, the bottom and right side | |||
:param size: padding size of input image, it could be integer or sequence. | |||
If it is an integer, the input image will be padded in four directions. | |||
If it is a sequence containing two integers, the bottom and right side | |||
of image will be padded. | |||
If it's a sequence contains four integer, the top, bottom, left, right | |||
If it is a sequence containing four integers, the top, bottom, left, right | |||
side of image will be padded with given size. | |||
:param value: Padding value of image, could be a sequence of int or float. | |||
if it's float value, the dtype of image will be casted to float32 also. | |||
:param mask_value: Padding value of segmentation map. | |||
:param order: The same with :class:`VisionTransform` | |||
:param value: padding value of image, could be a sequence of int or float. | |||
if it is a float value, the dtype of the image will also be cast to float32. | |||
:param mask_value: padding value of segmentation map. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, size=0, value=0, mask_value=0, *, order=None): | |||
@@ -350,15 +350,15 @@ class Pad(VisionTransform): | |||
class Resize(VisionTransform): | |||
r"""Resize the input data. | |||
:param output_size: Target size of image, with (height, width) shape. | |||
:param interpolation: Interpolation method. All methods are listed below: | |||
:param output_size: target size of image, with (height, width) shape. | |||
:param interpolation: interpolation method. All methods are listed below: | |||
* cv2.INTER_NEAREST – a nearest-neighbor interpolation. | |||
* cv2.INTER_LINEAR – a bilinear interpolation (used by default). | |||
* cv2.INTER_AREA – resampling using pixel area relation. | |||
* cv2.INTER_CUBIC – a bicubic interpolation over 4×4 pixel neighborhood. | |||
* cv2.INTER_LANCZOS4 – a Lanczos interpolation over 8×8 pixel neighborhood. | |||
:param order: The same with :class:`VisionTransform` | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, output_size, interpolation=cv2.INTER_LINEAR, *, order=None): | |||
@@ -476,8 +476,8 @@ class ShortestEdgeResize(VisionTransform): | |||
class RandomResize(VisionTransform): | |||
r"""Resize the input data randomly. | |||
:param scale_range: . | |||
:param order: The same with :class:`VisionTransform` | |||
:param scale_range: range of scaling. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, scale_range, interpolation=cv2.INTER_LINEAR, *, order=None): | |||
@@ -519,13 +519,13 @@ class RandomResize(VisionTransform): | |||
class RandomCrop(VisionTransform): | |||
r"""Crop the input data randomly. Before applying the crop transform, | |||
pad the image first. And if target size is still bigger than the size of | |||
pad the image first. If the target size is still bigger than the size of the | |||
padded image, pad the image to the target size. | |||
:param output_size: Target size of output image, with (height, width) shape. | |||
:param padding_size: The same with `size` in ``Pad`` | |||
:param padding_value: The same with `value` in ``Pad`` | |||
:param order: The same with :class:`VisionTransform` | |||
:param output_size: target size of output image, with (height, width) shape. | |||
:param padding_size: the same as `size` in ``Pad``. | |||
:param padding_value: the same as `value` in ``Pad``. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__( | |||
@@ -580,10 +580,10 @@ class RandomResizedCrop(VisionTransform): | |||
aspect ratio (default: 3/4 to 1.33) of the original aspect ratio is made. | |||
After applying the crop transform, the input data will be resized to the given size. | |||
:param output_size: Target size of output image, with (height, width) shape. | |||
:param scale_range: Range of size of the origin size cropped. Default: (0.08, 1.0) | |||
:param ratio_range: Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) | |||
:param order: The same with :class:`VisionTransform` | |||
:param output_size: target size of output image, with (height, width) shape. | |||
:param scale_range: range of size of the origin size cropped. Default: (0.08, 1.0) | |||
:param ratio_range: range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__( | |||
@@ -666,8 +666,8 @@ class RandomResizedCrop(VisionTransform): | |||
class CenterCrop(VisionTransform): | |||
r"""Crops the given the input data at the center. | |||
:param output_size: Target size of output image, with (height, width) shape. | |||
:param order: The same with :class:`VisionTransform` | |||
:param output_size: target size of output image, with (height, width) shape. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, output_size, *, order=None): | |||
@@ -710,7 +710,7 @@ class RandomHorizontalFlip(VisionTransform): | |||
r"""Horizontally flip the input data randomly with a given probability. | |||
:param prob: probability of the input data being flipped. Default: 0.5 | |||
:param order: The same with :class:`VisionTransform` | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, prob: float = 0.5, *, order=None): | |||
@@ -742,7 +742,7 @@ class RandomVerticalFlip(VisionTransform): | |||
r"""Vertically flip the input data randomly with a given probability. | |||
:param prob: probability of the input data being flipped. Default: 0.5 | |||
:param order: The same with :class:`VisionTransform` | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, prob: float = 0.5, *, order=None): | |||
@@ -776,9 +776,9 @@ class Normalize(VisionTransform): | |||
this transform will normalize each channel of the input data. | |||
``output[channel] = (input[channel] - mean[channel]) / std[channel]`` | |||
:param mean: Sequence of means for each channel. | |||
:param std: Sequence of standard deviations for each channel. | |||
:param order: The same with :class:`VisionTransform` | |||
:param mean: sequence of means for each channel. | |||
:param std: sequence of standard deviations for each channel. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, mean=0.0, std=1.0, *, order=None): | |||
@@ -802,7 +802,7 @@ class GaussianNoise(VisionTransform): | |||
:param mean: Gaussian mean used to generate noise. | |||
:param std: Gaussian standard deviation used to generate noise. | |||
:param order: The same with :class:`VisionTransform` | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, mean=0.0, std=1.0, *, order=None): | |||
@@ -826,9 +826,9 @@ class GaussianNoise(VisionTransform): | |||
class BrightnessTransform(VisionTransform): | |||
r"""Adjust brightness of the input data. | |||
:param value: How much to adjust the brightness. Can be any | |||
non negative number. 0 gives the original image | |||
:param order: The same with :class:`VisionTransform` | |||
:param value: how much to adjust the brightness. Can be any | |||
non-negative number. 0 gives the original image. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, value, *, order=None): | |||
@@ -857,9 +857,9 @@ class BrightnessTransform(VisionTransform): | |||
class ContrastTransform(VisionTransform): | |||
r"""Adjust contrast of the input data. | |||
:param value: How much to adjust the contrast. Can be any | |||
non negative number. 0 gives the original image | |||
:param order: The same with :class:`VisionTransform` | |||
:param value: how much to adjust the contrast. Can be any | |||
non-negative number. 0 gives the original image. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, value, *, order=None): | |||
@@ -888,9 +888,9 @@ class ContrastTransform(VisionTransform): | |||
class SaturationTransform(VisionTransform): | |||
r"""Adjust saturation of the input data. | |||
:param value: How much to adjust the saturation. Can be any | |||
non negative number. 0 gives the original image | |||
:param order: The same with :class:`VisionTransform` | |||
:param value: how much to adjust the saturation. Can be any | |||
non-negative number. 0 gives the original image. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, value, *, order=None): | |||
@@ -919,9 +919,9 @@ class SaturationTransform(VisionTransform): | |||
class HueTransform(VisionTransform): | |||
r"""Adjust hue of the input data. | |||
:param value: How much to adjust the hue. Can be any number | |||
between 0 and 0.5, 0 gives the original image | |||
:param order: The same with :class:`VisionTransform` | |||
:param value: how much to adjust the hue. Can be any number | |||
between 0 and 0.5; 0 gives the original image. | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, value, *, order=None): | |||
@@ -957,19 +957,19 @@ class HueTransform(VisionTransform): | |||
class ColorJitter(VisionTransform): | |||
r"""Randomly change the brightness, contrast, saturation and hue of an image. | |||
:param brightness: How much to jitter brightness. | |||
:param brightness: how much to jitter brightness. | |||
Chosen uniformly from [max(0, 1 - brightness), 1 + brightness] | |||
or the given [min, max]. Should be non-negative numbers. | |||
:param contrast: How much to jitter contrast. | |||
:param contrast: how much to jitter contrast. | |||
Chosen uniformly from [max(0, 1 - contrast), 1 + contrast] | |||
or the given [min, max]. Should be non-negative numbers. | |||
:param saturation: How much to jitter saturation. | |||
:param saturation: how much to jitter saturation. | |||
Chosen uniformly from [max(0, 1 - saturation), 1 + saturation] | |||
or the given [min, max]. Should be non-negative numbers. | |||
:param hue: How much to jitter hue. | |||
:param hue: how much to jitter hue. | |||
Chosen uniformly from [-hue, hue] or the given [min, max]. | |||
Should satisfy 0 <= hue <= 0.5 or -0.5 <= min <= max <= 0.5. | |||
:param order: The same with :class:`VisionTransform` | |||
:param order: the same as :class:`VisionTransform`. | |||
""" | |||
def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, *, order=None): | |||
@@ -7,6 +7,7 @@ | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
import os | |||
import re | |||
from .core._imperative_rt.common import CompNode, DeviceType | |||
from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config | |||
@@ -22,10 +23,8 @@ __all__ = [ | |||
def _valid_device(inp): | |||
if isinstance(inp, str) and len(inp) == 4: | |||
if inp[0] in {"x", "c", "g"} and inp[1:3] == "pu": | |||
if inp[3] == "x" or inp[3].isdigit(): | |||
return True | |||
if isinstance(inp, str) and re.match(r"^[cxg]pu(\d+|\d+:\d+|x)$", inp): | |||
return True | |||
return False | |||
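A quick check of which device strings the new pattern accepts (the names below are illustrative):

    import re

    pattern = r"^[cxg]pu(\d+|\d+:\d+|x)$"
    for name in ["cpu0", "gpu3", "gpu1:0", "xpux", "gpu", "cpu01a"]:
        print(name, bool(re.match(pattern, name)))
    # cpu0, gpu3, gpu1:0 and xpux match; "gpu" and "cpu01a" do not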
@@ -71,11 +70,11 @@ def set_default_device(device: str = "xpux"): | |||
'multithread' device type is available for inference, which implements | |||
multi-threading parallelism at the operator level. For example, | |||
'multithread4' will compute with 4 threads. which implements | |||
'multithread4' will compute with 4 threads. | |||
The default value is 'xpux', which specifies any available device; GPU takes priority over CPU when both are available. | |||
It can also be set by environmental variable `MGE_DEFAULT_DEVICE`. | |||
It can also be set by environment variable `MGE_DEFAULT_DEVICE`. | |||
""" | |||
assert _valid_device(device), "Invalid device name {}".format(device) | |||
CompNode._set_default_device(device) | |||
@@ -99,13 +98,13 @@ def set_prealloc_config( | |||
growth_factor=2.0, | |||
device_type=DeviceType.CUDA, | |||
): | |||
"""specifies how to pre-allocate from raw dev allocator | |||
"""Specifies how to pre-allocate from raw device allocator. | |||
:param alignment: specifies the alignment in bytes. | |||
:param min_req: min request size in bytes. | |||
:param max_overhead: max overhead above required size in bytes. | |||
:growth_factor: request size / cur allocated | |||
:device_type: the device type | |||
:param growth_factor: `request size / cur allocated`. | |||
:param device_type: the device type. | |||
""" | |||
assert alignment > 0 | |||
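A hedged usage sketch; the numeric values are illustrative only, and the import path assumes both names are reachable from ``megengine.device`` as the surrounding module suggests:

    from megengine.device import DeviceType, set_prealloc_config

    set_prealloc_config(
        alignment=64,                 # align raw requests to 64 bytes
        min_req=4 * 1024 ** 2,        # never request less than 4 MiB from the raw allocator
        max_overhead=8 * 1024 ** 2,   # cap the overhead above the required size
        growth_factor=2.0,            # grow requests geometrically
        device_type=DeviceType.CUDA,
    )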
@@ -102,7 +102,7 @@ def _(op: RemoteRecv): | |||
def collective_comm(inp, mode, group, device): | |||
"""Helper function for applying collective communication functions""" | |||
"""Helper function for applying collective communication functions.""" | |||
assert isinstance(group, Group) | |||
if group is None: | |||
return inp | |||
@@ -123,11 +123,11 @@ def collective_comm(inp, mode, group, device): | |||
def reduce_sum( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create reduce_sum operator for collective communication | |||
"""Create reduce_sum operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.REDUCE_SUM | |||
return collective_comm(inp, mode, group, device) | |||
@@ -136,11 +136,11 @@ def reduce_sum( | |||
def broadcast( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create broadcast operator for collective communication | |||
"""Create broadcast operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.BROADCAST | |||
return collective_comm(inp, mode, group, device) | |||
@@ -149,11 +149,11 @@ def broadcast( | |||
def all_gather( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create all_gather operator for collective communication | |||
"""Create all_gather operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.ALL_GATHER | |||
return collective_comm(inp, mode, group, device) | |||
@@ -162,11 +162,11 @@ def all_gather( | |||
def reduce_scatter_sum( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create reduce_scatter_sum operator for collective communication | |||
"""Create reduce_scatter_sum operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.REDUCE_SCATTER_SUM | |||
return collective_comm(inp, mode, group, device) | |||
@@ -175,11 +175,11 @@ def reduce_scatter_sum( | |||
def all_reduce_sum( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create all_reduce_sum operator for collective communication | |||
"""Create all_reduce_sum operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.ALL_REDUCE_SUM | |||
return collective_comm(inp, mode, group, device) | |||
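A short sketch of the usual averaging pattern built on ``all_reduce_sum``; it assumes the process group has already been initialized and that ``get_rank``/``get_world_size`` are re-exported at the ``megengine.distributed`` package level:

    import megengine.distributed as dist
    from megengine.tensor import Tensor

    x = Tensor([float(dist.get_rank())])
    total = dist.functional.all_reduce_sum(x)    # sum the value across all ranks
    mean = total / dist.get_world_size()         # divide to obtain the mean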
@@ -188,11 +188,11 @@ def all_reduce_sum( | |||
def all_reduce_max( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create all_reduce_max operator for collective communication | |||
"""Create all_reduce_max operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.ALL_REDUCE_MAX | |||
return collective_comm(inp, mode, group, device) | |||
@@ -201,11 +201,11 @@ def all_reduce_max( | |||
def all_reduce_min( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create all_reduce_min operator for collective communication | |||
"""Create all_reduce_min operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.ALL_REDUCE_MIN | |||
return collective_comm(inp, mode, group, device) | |||
@@ -214,11 +214,11 @@ def all_reduce_min( | |||
def gather( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create gather operator for collective communication | |||
"""Create gather operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.GATHER | |||
return collective_comm(inp, mode, group, device) | |||
@@ -227,11 +227,11 @@ def gather( | |||
def scatter( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create scatter operator for collective communication | |||
"""Create scatter operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.SCATTER | |||
return collective_comm(inp, mode, group, device) | |||
@@ -240,21 +240,21 @@ def scatter( | |||
def all_to_all( | |||
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" | |||
) -> Tensor: | |||
"""Create all_to_all operator for collective communication | |||
"""Create all_to_all operator for collective communication. | |||
:param inp: input tensor | |||
:param group: communication group | |||
:param device: execute placement | |||
:param inp: input tensor. | |||
:param group: communication group. | |||
:param device: execution device. | |||
""" | |||
mode = CollectiveCommMode.ALL_TO_ALL | |||
return collective_comm(inp, mode, group, device) | |||
def remote_send(inp: Tensor, dest_rank: int) -> Tensor: | |||
"""Send a Tensor to a remote process | |||
"""Send a Tensor to a remote process. | |||
:param inp: tensor to send | |||
:param dest_rank: destination process rank | |||
:param inp: tensor to send. | |||
:param dest_rank: destination process rank. | |||
""" | |||
op = RemoteSend() | |||
op.key = "{}->{}".format(get_rank(), dest_rank) | |||
@@ -266,12 +266,12 @@ def remote_send(inp: Tensor, dest_rank: int) -> Tensor: | |||
def remote_recv( | |||
src_rank: int, shape: Tuple[int], dtype: type, device: Optional[str] = None | |||
) -> Tensor: | |||
"""Receive a Tensor from a remote process | |||
"""Receive a Tensor from a remote process. | |||
:param src_rank: source process rank | |||
:param shape: the shape of the tensor to receive | |||
:param dtype: the data type of the tensor to receive | |||
:param device: the device to place the received tensor | |||
:param src_rank: source process rank. | |||
:param shape: the shape of the tensor to receive. | |||
:param dtype: the data type of the tensor to receive. | |||
:param device: the device to place the received tensor. | |||
""" | |||
key = "{}->{}".format(src_rank, get_rank()) | |||
@@ -83,12 +83,12 @@ def init_process_group( | |||
) -> None: | |||
"""Initialize the distributed process group and specify the device used in the current process | |||
:param master_ip: IP address of the master node | |||
:param port: Port available for all processes to communicate | |||
:param world_size: Total number of processes participating in the job | |||
:param rank: Rank of the current process | |||
:param device: The GPU device id to bind this process to | |||
:param backend: Communicator backend, currently support 'nccl' and 'ucx' | |||
:param master_ip: ip address of the master node. | |||
:param port: port available for all processes to communicate. | |||
:param world_size: total number of processes participating in the job. | |||
:param rank: rank of the current process. | |||
:param device: the GPU device id to bind this process to. | |||
:param backend: communicator backend, currently support 'nccl' and 'ucx'. | |||
""" | |||
if not isinstance(master_ip, str): | |||
raise TypeError("Expect type str but got {}".format(type(master_ip))) | |||
@@ -127,50 +127,50 @@ def init_process_group( | |||
def is_distributed() -> bool: | |||
"""Return True if the distributed process group has been initialized""" | |||
"""Return True if the distributed process group has been initialized.""" | |||
return _sd is not None | |||
def get_rank() -> int: | |||
"""Get the rank of the current process""" | |||
"""Get the rank of the current process.""" | |||
return _sd.proc_rank if _sd is not None else 0 | |||
def get_world_size() -> int: | |||
"""Get the total number of processes participating in the job""" | |||
"""Get the total number of processes participating in the job.""" | |||
return _sd.world_size if _sd is not None else 1 | |||
def get_backend() -> str: | |||
"""Get the backend str""" | |||
"""Get the backend str.""" | |||
assert _sd is not None, "please call init_process_group first" | |||
return _sd.backend if _sd is not None else None | |||
def get_py_server_addr() -> Tuple[str, int]: | |||
"""Get master_ip and port of python XML RPC server""" | |||
"""Get master_ip and port of python XML RPC server.""" | |||
assert _sd is not None, "please call init_process_group first" | |||
return _sd.master_ip, _sd.py_server_port | |||
def get_mm_server_addr() -> Tuple[str, int]: | |||
"""Get master_ip and port of C++ mm_server""" | |||
"""Get master_ip and port of C++ mm_server.""" | |||
assert _sd is not None, "please call init_process_group first" | |||
return _sd.master_ip, _sd.mm_server_port | |||
def get_client() -> Client: | |||
"""Get client of python XML RPC server""" | |||
"""Get client of python XML RPC server.""" | |||
assert _sd is not None, "please call init_process_group first" | |||
return _sd.client | |||
def new_group(proc_ranks: List[int]) -> Group: | |||
"""Build a subgroup containing certain ranks""" | |||
"""Build a subgroup containing certain ranks.""" | |||
return Group(proc_ranks) | |||
def group_barrier(group: Optional[Group] = WORLD) -> None: | |||
"""Block until all ranks in the group reach this barrier""" | |||
"""Block until all ranks in the group reach this barrier.""" | |||
assert isinstance(group, Group) | |||
_sd.client.group_barrier(group.key, group.size) |
@@ -17,11 +17,112 @@ import numpy as np | |||
from megengine.autodiff.grad_manager import GradManager, get_backwarding_grad_manager | |||
from megengine.device import get_default_device, get_device_count | |||
from ..functional.param_pack import get_offsets, pack_allreduce_split | |||
from ..core.ops.builtin import ParamPackConcat, ParamPackSplit | |||
from ..core.tensor.core import apply | |||
from ..functional.utils import copy | |||
from ..tensor import Tensor | |||
from ..utils.future import Future | |||
from .functional import all_reduce_sum, broadcast | |||
from .group import WORLD, group_barrier, is_distributed | |||
from .group import WORLD, Group, group_barrier, is_distributed | |||
def param_pack_split(inp: Tensor, offsets: list, shapes: list): | |||
r""" | |||
Returns a list of tensors split from the input tensor according to the given offsets and shapes, | |||
only used for ``parampack``. | |||
:param inp: input tensor. | |||
:param offsets: offsets of outputs, length of `2 * n`, | |||
where n is the number of tensors you want to split, | |||
format `[begin0, end0, begin1, end1]`. | |||
:param shapes: tensor shapes of outputs. | |||
:return: split tensors. | |||
Examples: | |||
.. testcode:: | |||
import numpy as np | |||
from megengine import tensor | |||
from megengine.distributed.helper import param_pack_split | |||
a = tensor(np.ones((10,), np.int32)) | |||
b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)]) | |||
print(b.numpy()) | |||
print(c.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[1] | |||
[[1 1 1] | |||
[1 1 1] | |||
[1 1 1]] | |||
""" | |||
op = ParamPackSplit() | |||
op.offsets = offsets | |||
op.shapes = shapes | |||
return apply(op, inp) | |||
def param_pack_concat(inps: list, offsets: Tensor, offsets_val: list): | |||
r""" | |||
Returns the concatenated tensor; only used for ``parampack``. | |||
:param inps: input tensors. | |||
:param offsets: device value of offsets. | |||
:param offsets_val: offsets of inputs, length of `2 * n`, | |||
format `[begin0, end0, begin1, end1]`. | |||
:return: concatenated tensor. | |||
Examples: | |||
.. testcode:: | |||
import numpy as np | |||
from megengine import tensor | |||
from megengine.distributed.helper import param_pack_concat | |||
a = tensor(np.ones((1,), np.int32)) | |||
b = tensor(np.ones((3, 3), np.int32)) | |||
offsets_val = [0, 1, 1, 10] | |||
offsets = tensor(offsets_val, np.int32) | |||
c = param_pack_concat([a, b], offsets, offsets_val) | |||
print(c.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[1 1 1 1 1 1 1 1 1 1] | |||
""" | |||
op = ParamPackConcat() | |||
op.offsets = offsets_val | |||
return apply(op, *inps, offsets)[0] | |||
def get_offsets(shapes): | |||
offsets = [] | |||
offset = 0 | |||
for shape in shapes: | |||
offsets.append(offset) | |||
offset += int(np.prod(shape)) | |||
offsets.append(offset) | |||
return offsets | |||
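A tiny check of the offset layout produced by ``get_offsets`` for the shapes used in the ``param_pack_split`` example above (assuming it is importable from ``megengine.distributed.helper`` as in this diff):

    from megengine.distributed.helper import get_offsets

    print(get_offsets([(1,), (3, 3)]))   # -> [0, 1, 1, 10], i.e. [begin0, end0, begin1, end1]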
def pack_allreduce_split(pack_list, shapes, group, reduce_method): | |||
offsets_val = get_offsets(shapes) | |||
offsets = Tensor(offsets_val) | |||
packed_grads = param_pack_concat(pack_list, offsets, offsets_val) | |||
packed_grads = all_reduce_sum(packed_grads, group, group.comp_node) | |||
if reduce_method == "mean": | |||
packed_grads /= group.size | |||
grads = param_pack_split(packed_grads, offsets_val, shapes) | |||
return grads | |||
class TensorFuture(Future): | |||
@@ -54,28 +155,43 @@ def synchronized(func: Callable): | |||
return wrapper | |||
def get_device_count_by_fork(device_type: str): | |||
q = mp.Queue() | |||
def _get_device_count_worker(queue, device_type): | |||
num = get_device_count(device_type) | |||
queue.put(num) | |||
def worker(queue): | |||
num = get_device_count(device_type) | |||
queue.put(num) | |||
p = mp.Process(target=worker, args=(q,)) | |||
def get_device_count_by_fork(device_type: str): | |||
"""Get device count in fork thread. | |||
See https://stackoverflow.com/questions/22950047/cuda-initialization-error-after-fork | |||
for more information. | |||
""" | |||
q = mp.Queue() | |||
p = mp.Process(target=_get_device_count_worker, args=(q, device_type)) | |||
p.start() | |||
p.join() | |||
return q.get() | |||
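A short usage sketch: querying the GPU count through a forked child keeps the parent process from initializing CUDA before it spawns training workers (import path follows this diff):

    from megengine.distributed.helper import get_device_count_by_fork

    n_gpus = get_device_count_by_fork("gpu")
    print("visible GPUs:", n_gpus)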
def bcast_list_(params, group): | |||
for p in params: | |||
p._reset(broadcast(p, group)) | |||
def bcast_list_(inps: list, group: Group = WORLD): | |||
"""Broadcast tensors between given group. | |||
:param inps: input tensors. | |||
:param group: communication group. | |||
""" | |||
for inp in inps: | |||
inp._reset(broadcast(inp, group)) | |||
class AllreduceCallback: | |||
def __init__(self, reduce_method, group=WORLD): | |||
"""Allreduce Callback with tensor fusion optimization. | |||
:param reduce_method: the method to reduce gradients. | |||
:param group: communication group. | |||
""" | |||
def __init__(self, reduce_method: str, group: Group = WORLD): | |||
reduce_method = reduce_method.lower() | |||
assert reduce_method in ["sum", "mean"] | |||
assert reduce_method in ["sum", "mean"], "reduce_method should be sum or mean" | |||
self._reduce_method = reduce_method | |||
self._group = group | |||
self._marked_gm = WeakSet() | |||
@@ -88,6 +204,7 @@ class AllreduceCallback: | |||
self._futures_dict = dict() | |||
self._packing_list = defaultdict(list) | |||
self._packing_size = defaultdict(int) | |||
self._grad_origin_device = dict() | |||
def _pack(self, dtype): | |||
grad_list = [self._gradients_dict[p] for p in self._packing_list[dtype]] | |||
@@ -109,6 +226,7 @@ class AllreduceCallback: | |||
self._params.append(param) | |||
self._futures_dict[param] = TensorFuture(ack=False) | |||
self._gradients_dict[param] = grad | |||
self._grad_origin_device[param] = str(grad.device) | |||
dtype_str = str(np.dtype(param.dtype)) | |||
dtype_size = np.dtype(param.dtype).itemsize | |||
@@ -123,6 +241,7 @@ class AllreduceCallback: | |||
self._pack(dtype) | |||
for param in self._params: | |||
grad = self._gradients_dict[param] | |||
grad = copy(grad, self._grad_origin_device[param]) | |||
self._futures_dict[param].set(grad) | |||
self._reset() | |||
@@ -15,7 +15,7 @@ from .util import get_free_ports | |||
def _run_wrapped(func, master_ip, port, world_size, rank, dev, args, kwargs): | |||
"""init distributed process group and run wrapped function""" | |||
"""Init distributed process group and run wrapped function.""" | |||
init_process_group( | |||
master_ip=master_ip, port=port, world_size=world_size, rank=rank, device=dev | |||
) | |||
@@ -23,7 +23,7 @@ def _run_wrapped(func, master_ip, port, world_size, rank, dev, args, kwargs): | |||
def launcher(func): | |||
"""decorator for launching multiple processes in single-machine multi-gpu training""" | |||
"""Decorator for launching multiple processes in single-machine multi-gpu training.""" | |||
n_gpus = get_device_count_by_fork("gpu") | |||
@@ -21,6 +21,12 @@ from .util import get_free_ports | |||
class Methods: | |||
"""Distributed Server Method. | |||
Used for exchanging information between distributed nodes. | |||
:param mm_server_port: multiple machine rpc server port. | |||
""" | |||
def __init__(self, mm_server_port): | |||
self.lock = threading.Lock() | |||
self.mm_server_port = mm_server_port | |||
@@ -31,51 +37,65 @@ class Methods: | |||
self.dict_barrier_event = defaultdict(threading.Event) | |||
def connect(self): | |||
"""Method for checking connection success.""" | |||
return True | |||
def get_mm_server_port(self): | |||
"""Get multiple machine rpc server port.""" | |||
return self.mm_server_port | |||
def set_is_grad(self, rank_peer, is_grad): | |||
def set_is_grad(self, key, is_grad): | |||
"""Mark send/recv need gradiants by key. | |||
:param key: key to match send/recv op. | |||
:param is_grad: whether this op need grad. | |||
""" | |||
with self.lock: | |||
future = self.dict_is_grad[rank_peer] | |||
future = self.dict_is_grad[key] | |||
future.set(is_grad) | |||
return True | |||
def check_is_grad(self, rank_peer): | |||
def check_is_grad(self, key): | |||
"""Check whether send/recv need gradiants. | |||
:param key: key to match send/recv op. | |||
""" | |||
with self.lock: | |||
future = self.dict_is_grad[rank_peer] | |||
future = self.dict_is_grad[key] | |||
ret = future.get() | |||
with self.lock: | |||
del self.dict_is_grad[rank_peer] | |||
del self.dict_is_grad[key] | |||
return ret | |||
def set_remote_tracer(self, rank_peer, tracer_set): | |||
def set_remote_tracer(self, key, tracer_set): | |||
"""Set tracer dict for tracing send/recv op. | |||
:param key: key to match send/recv op. | |||
:param tracer_set: valid tracer set. | |||
""" | |||
with self.lock: | |||
future = self.dict_remote_tracer[rank_peer] | |||
future = self.dict_remote_tracer[key] | |||
future.set(tracer_set) | |||
return True | |||
def check_remote_tracer(self, rank_peer): | |||
def check_remote_tracer(self, key): | |||
"""Get tracer dict for send/recv op. | |||
:param key: key to match send/recv op. | |||
""" | |||
with self.lock: | |||
future = self.dict_remote_tracer[rank_peer] | |||
future = self.dict_remote_tracer[key] | |||
ret = future.get() | |||
with self.lock: | |||
del self.dict_remote_tracer[rank_peer] | |||
del self.dict_remote_tracer[key] | |||
return ret | |||
def set_pack_list(self, key, pack_list): | |||
with self.lock: | |||
future = self.dict_pack_list[key] | |||
future.set(pack_list) | |||
return True | |||
def get_pack_list(self, key): | |||
with self.lock: | |||
future = self.dict_pack_list[key] | |||
return future.get() | |||
def group_barrier(self, key, size): | |||
"""A barrier wait for all group member. | |||
:param key: group key to match each other. | |||
:param size: group size. | |||
""" | |||
with self.lock: | |||
self.dict_barrier_counter[key] += 1 | |||
counter = self.dict_barrier_counter[key] | |||
@@ -94,12 +114,23 @@ class ThreadXMLRPCServer(ThreadingMixIn, SimpleXMLRPCServer): | |||
def start_server(py_server_port, mm_server_port): | |||
"""Start python distributed server and multiple machine server. | |||
:param py_server_port: python server port. | |||
:param mm_server_port: multiple machine server port. | |||
""" | |||
server = ThreadXMLRPCServer(("0.0.0.0", py_server_port), logRequests=False) | |||
server.register_instance(Methods(mm_server_port)) | |||
server.serve_forever() | |||
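A minimal single-machine sketch of the handshake documented here; it assumes the ``Server`` constructor starts serving in the background (as the retry loop in ``Client.connect`` suggests) and that both classes live in ``megengine.distributed.server``:

    from megengine.distributed.server import Client, Server

    server = Server(port=0)                               # master: port == 0 picks a free port
    client = Client("127.0.0.1", server.py_server_port)   # worker: connect back to the master
    print(client.get_mm_server_port())                    # port of the C++ mm_server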
class Server: | |||
"""Distributed Server for distributed training. | |||
Should be running on the master node. | |||
:param port: python server port. | |||
""" | |||
def __init__(self, port): | |||
self.py_server_port = get_free_ports(1)[0] if port == 0 else port | |||
self.mm_server_port = create_mm_server("0.0.0.0", 0) | |||
@@ -112,12 +143,19 @@ class Server: | |||
class Client: | |||
"""Distributed Client for distributed training. | |||
:param master_ip: ip address of master node. | |||
:param port: port of server at master node. | |||
""" | |||
def __init__(self, master_ip, port): | |||
self.master_ip = master_ip | |||
self.port = port | |||
self.connect() | |||
def connect(self): | |||
"""Check connection success.""" | |||
while True: | |||
try: | |||
self.proxy = ServerProxy( | |||
@@ -129,25 +167,43 @@ class Client: | |||
time.sleep(1) | |||
def get_mm_server_port(self): | |||
"""Get multiple machine server port.""" | |||
return self.proxy.get_mm_server_port() | |||
def set_is_grad(self, rank_peer, is_grad): | |||
self.proxy.set_is_grad(rank_peer, is_grad) | |||
def check_is_grad(self, rank_peer): | |||
return self.proxy.check_is_grad(rank_peer) | |||
def set_remote_tracer(self, rank_peer, tracer_set): | |||
self.proxy.set_remote_tracer(rank_peer, tracer_set) | |||
def check_remote_tracer(self, rank_peer): | |||
return self.proxy.check_remote_tracer(rank_peer) | |||
def set_pack_list(self, key, pack_list): | |||
self.proxy.set_pack_list(key, pack_list) | |||
def get_pack_list(self, key): | |||
return self.proxy.get_pack_list(key) | |||
def set_is_grad(self, key, is_grad): | |||
"""Mark send/recv need gradiants by key. | |||
:param key: key to match send/recv op. | |||
:param is_grad: whether this op need grad. | |||
""" | |||
self.proxy.set_is_grad(key, is_grad) | |||
def check_is_grad(self, key): | |||
"""Check whether send/recv need gradiants. | |||
:param key: key to match send/recv op. | |||
""" | |||
return self.proxy.check_is_grad(key) | |||
def set_remote_tracer(self, key, tracer_set): | |||
"""Set tracer dict for tracing send/recv op. | |||
:param key: key to match send/recv op. | |||
:param tracer_set: valid tracer set. | |||
""" | |||
self.proxy.set_remote_tracer(key, tracer_set) | |||
def check_remote_tracer(self, key): | |||
"""Get tracer dict for send/recv op. | |||
:param key: key to match send/recv op. | |||
""" | |||
return self.proxy.check_remote_tracer(key) | |||
def group_barrier(self, key, size): | |||
"""A barrier wait for all group member. | |||
:param key: group key to match each other. | |||
:param size: group size. | |||
""" | |||
self.proxy.group_barrier(key, size) |
@@ -8,13 +8,10 @@ | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# pylint: disable=redefined-builtin | |||
from .elemwise import * | |||
from .graph import add_update | |||
from .loss import * | |||
from .math import * | |||
from .nn import * | |||
from .quantized import conv_bias_activation | |||
from .tensor import * | |||
from .utils import accuracy, copy | |||
from .utils import * | |||
from . import distributed # isort:skip | |||
@@ -26,14 +26,14 @@ def set_conv_execution_strategy(option: str): | |||
Available values: | |||
* 'HEURISTIC' uses heuristic to choose the fastest algorithm. | |||
* 'PROFILE' runs possible algorithms on real device to find the best. | |||
* 'PROFILE_HEURISTIC' uses profile result and heuristic to choose the fastest algorithm. | |||
* 'PROFILE_REPRODUCIBLE' uses the fastest of profile result that is also reproducible. | |||
* 'PROFILE' runs possible algorithms on real device to find the best one. | |||
* 'PROFILE_HEURISTIC' uses profiling result and heuristic to choose the fastest algorithm. | |||
* 'PROFILE_REPRODUCIBLE' uses the fastest of profiling result that is also reproducible. | |||
* 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible. | |||
The default strategy is 'HEURISTIC'. | |||
It can also be set through the environmental variable 'MEGENGINE_CONV_EXECUTION_STRATEGY'. | |||
It can also be set through the environment variable 'MEGENGINE_CONV_EXECUTION_STRATEGY'. | |||
""" | |||
valid_option = ( | |||
"HEURISTIC", | |||
@@ -26,23 +26,22 @@ __all__ = [ | |||
"acosh", | |||
"atanh", | |||
"ceil", | |||
"clamp", | |||
"clip", | |||
"cos", | |||
"cosh", | |||
"div", | |||
"eq", | |||
"equal", | |||
"exp", | |||
"expm1", | |||
"fast_tanh", | |||
"floor", | |||
"floor_div", | |||
"gt", | |||
"ge", | |||
"greater", | |||
"greater_equal", | |||
"hswish", | |||
"hsigmoid", | |||
"left_shift", | |||
"lt", | |||
"le", | |||
"less", | |||
"less_equal", | |||
"log", | |||
"log1p", | |||
"logical_and", | |||
@@ -54,7 +53,7 @@ __all__ = [ | |||
"mod", | |||
"mul", | |||
"neg", | |||
"ne", | |||
"not_equal", | |||
"pow", | |||
"relu", | |||
"relu6", | |||
@@ -88,13 +87,6 @@ def _elwise(*args, mode): | |||
return result | |||
def _logical(*args, mode): | |||
op = builtin.CondExecPredLogical(mode=mode) | |||
args = utils.convert_inputs(*args) | |||
(result,) = apply(op, *args) | |||
return result | |||
def _elemwise_multi_type(*args, mode, **kwargs): | |||
op = builtin.ElemwiseMultiType(mode=mode, **kwargs) | |||
args = utils.convert_inputs(*args) | |||
@@ -106,9 +98,10 @@ def _elemwise_multi_type(*args, mode, **kwargs): | |||
def add(x, y): | |||
"""Element-wise addition. | |||
"""Element-wise `addition`. | |||
At least one operand should be tensor. | |||
Same for sub/mul/div/floor_div/pow/mod/atan2/eq/ne/lt/le/gt/ge/maximum/minmium. | |||
Same for sub/mul/div/floor_div/pow/mod/atan2/equal/not_equal/less/less_equal/greater/greater_equal/maximum/minimum. | |||
:param x: input tensor. | |||
:return: computed tensor. | |||
@@ -138,68 +131,68 @@ def add(x, y): | |||
def sub(x, y): | |||
"""Element-wise subtraction.""" | |||
"""Element-wise `subtraction`.""" | |||
return _elwise(x, y, mode="sub") | |||
def mul(x, y): | |||
"""Element-wise multiplication.""" | |||
"""Element-wise `multiplication`.""" | |||
return _elwise(x, y, mode="mul") | |||
def div(x, y): | |||
"""Element-wise (x / y).""" | |||
"""Element-wise `(x / y)`.""" | |||
return _elwise(x, y, mode="true_div") | |||
def floor_div(x, y): | |||
"""Element-wise floor(x / y).""" | |||
"""Element-wise `floor(x / y)`.""" | |||
return _elwise(x, y, mode="floor_divide") | |||
def neg(x): | |||
"""Element-wise negation.""" | |||
"""Element-wise `negation`.""" | |||
return _elwise(x, mode="negate") | |||
def pow(x, y): | |||
"""Element-wise power.""" | |||
"""Element-wise `power`.""" | |||
return _elwise(x, y, mode="pow") | |||
def mod(x, y): | |||
"""Element-wise remainder of division.""" | |||
"""Element-wise `remainder of division`.""" | |||
return _elwise(x, y, mode="mod") | |||
def abs(x): | |||
"""Element-wise absolute value.""" | |||
"""Element-wise `absolute value`.""" | |||
return _elwise(x, mode="abs") | |||
def exp(x): | |||
"""Element-wise exponential.""" | |||
"""Element-wise `exponential`.""" | |||
return _elwise(x, mode="exp") | |||
def expm1(x): | |||
"""Element-wise exp(x)-1.""" | |||
"""Element-wise `exp(x)-1`.""" | |||
return _elwise(x, mode="expm1") | |||
def log(x): | |||
"""Element-wise logarithm (base `e`).""" | |||
"""Element-wise `logarithm (base e)`.""" | |||
return _elwise(x, mode="log") | |||
def log1p(x): | |||
"""Element-wise log(x+1) (base `e`).""" | |||
"""Element-wise `log(x+1) (base e)`.""" | |||
return _elwise(x, mode="log1p") | |||
def sqrt(x: Tensor) -> Tensor: | |||
"""Element-wise sqrt. | |||
For negative input value, return ``NaN``. | |||
"""Element-wise `sqrt`. | |||
Returns ``NaN`` for negative input value. | |||
:param x: input tensor. | |||
:return: computed tensor. | |||
@@ -229,10 +222,10 @@ def sqrt(x: Tensor) -> Tensor: | |||
def square(x: Tensor) -> Tensor: | |||
""" | |||
Return a new tensor with the square of the elements of input tensor. | |||
Returns a new tensor with the square of the elements of input tensor. | |||
:param inp: The input tensor | |||
:return: The computed tensor | |||
:param x: input tensor. | |||
:return: computed tensor. | |||
Examples: | |||
@@ -258,27 +251,27 @@ def square(x: Tensor) -> Tensor: | |||
def round(x): | |||
"""Element-wise rounding to int.""" | |||
"""Element-wise `rounding to int`.""" | |||
return _elwise(x, mode="round") | |||
def ceil(x): | |||
"""Element-wise ceiling.""" | |||
"""Element-wise `ceiling`.""" | |||
return _elwise(x, mode="ceil") | |||
def floor(x): | |||
"""Element-wise floor.""" | |||
"""Element-wise `floor`.""" | |||
return _elwise(x, mode="floor") | |||
def maximum(x, y): | |||
"""Element-wise maximum of array elements.""" | |||
"""Element-wise `maximum of array elements`.""" | |||
return _elwise(x, y, mode="max") | |||
def minimum(x, y): | |||
"""Element-wise minimum of array elements.""" | |||
"""Element-wise `minimum of array elements`.""" | |||
return _elwise(x, y, mode="min") | |||
@@ -286,7 +279,7 @@ def minimum(x, y): | |||
def cos(x): | |||
"""Element-wise cosine. | |||
"""Element-wise `cosine`. | |||
:param x: input tensor. | |||
:return: computed tensor. | |||
@@ -315,80 +308,71 @@ def cos(x): | |||
def sin(x): | |||
"""Element-wise sine.""" | |||
"""Element-wise `sine`.""" | |||
return _elwise(x, mode="sin") | |||
def tan(x): | |||
"""Element-wise tangent.""" | |||
"""Element-wise `tangent`.""" | |||
return sin(x) / cos(x) | |||
def acos(x): | |||
"""Element-wise inverse cosine.""" | |||
"""Element-wise `inverse cosine`.""" | |||
return _elwise(x, mode="acos") | |||
def asin(x): | |||
"""Element-wise inverse sine.""" | |||
"""Element-wise `inverse sine`.""" | |||
return _elwise(x, mode="asin") | |||
def atan(x): | |||
"""Element-wise inverse tangent.""" | |||
"""Element-wise `inverse tangent`.""" | |||
return _elwise(x, 1, mode="atan2") | |||
def atan2(y, x): | |||
"""Element-wise 2-argument arctangent.""" | |||
"""Element-wise `2-argument arctangent`.""" | |||
return _elwise(y, x, mode="atan2") | |||
def cosh(x): | |||
r"""Element-wise hyperbolic cosine.""" | |||
r"""Element-wise `hyperbolic cosine`.""" | |||
return 0.5 * (exp(x) + exp(-x)) | |||
def sinh(x): | |||
r"""Element-wise hyperbolic sine.""" | |||
r"""Element-wise `hyperbolic sine`.""" | |||
u = expm1(x) | |||
return 0.5 * u / (u + 1) * (u + 2) | |||
def tanh(x): | |||
r"""Element-wise hyperbolic tangent.""" | |||
r"""Element-wise `hyperbolic tangent`.""" | |||
return _elwise(x, mode="tanh") | |||
def asinh(x): | |||
r"""Element-wise inverse hyperbolic sine.""" | |||
r"""Element-wise `inverse hyperbolic sine`.""" | |||
return log(x + (x ** 2 + 1) ** 0.5) | |||
def acosh(x): | |||
r"""Element-wise inverse hyperbolic cosine.""" | |||
r"""Element-wise `inverse hyperbolic cosine`.""" | |||
return log(x + (x ** 2 - 1) ** 0.5) | |||
def atanh(x): | |||
r"""Element-wise inverse hyperbolic tangent.""" | |||
r"""Element-wise `inverse hyperbolic tangent`.""" | |||
return log1p(2 * x / (1 - x)) / 2 | |||
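These hyperbolic helpers are composed from ``exp``/``expm1``/``log``/``log1p`` identities rather than dedicated kernels; ``sinh`` in particular uses ``expm1`` so it stays accurate near zero. A minimal NumPy sketch of that identity (``sinh_via_expm1`` is a hypothetical reference helper, not part of MegEngine):

    import numpy as np

    def sinh_via_expm1(x):
        # 0.5 * (exp(x) - exp(-x)) rewritten with u = expm1(x) = exp(x) - 1,
        # so exp(-x) = 1 / (u + 1) and sinh(x) = 0.5 * u / (u + 1) * (u + 2)
        u = np.expm1(x)
        return 0.5 * u / (u + 1) * (u + 2)

    x = np.array([-1e-8, 0.0, 1e-8, 1.0], dtype=np.float64)
    print(np.allclose(sinh_via_expm1(x), np.sinh(x)))  # True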
def fast_tanh(x): | |||
r"""Element-wise fast tanh; this is an approximation: | |||
.. math:: | |||
\text{fast_tanh}(x) = x * (27. + x * x) / (27. + 9. * x * x) | |||
""" | |||
return _elwise(x, mode="fast_tanh") | |||
# bit-twiddling functions | |||
def left_shift(x, y): | |||
"""Element-wise bitwise binary: x << y. | |||
"""Element-wise `bitwise binary: x << y`. | |||
:param x: input tensor, should be int. | |||
:param y: how many bits to be left-shifted. | |||
@@ -418,7 +402,7 @@ def left_shift(x, y): | |||
def right_shift(x, y): | |||
"""Element-wise bitwise binary: x >> y.""" | |||
"""Element-wise `bitwise binary: x >> y`.""" | |||
return _elwise(x, y, mode="shr") | |||
@@ -426,30 +410,30 @@ def right_shift(x, y): | |||
def logical_and(x, y): | |||
"""Element-wise logical and: x && y.""" | |||
"""Element-wise `logical and: x && y`.""" | |||
return _elwise(x, y, mode="AND") | |||
def logical_not(x): | |||
"""Element-wise logical not: ~x.""" | |||
"""Element-wise `logical not: ~x`.""" | |||
return _elwise(x, mode="NOT") | |||
def logical_or(x, y): | |||
"""Element-wise logical or: x || y.""" | |||
"""Element-wise `logical or: x || y`.""" | |||
return _elwise(x, y, mode="OR") | |||
def logical_xor(x, y): | |||
"""Element-wise logical xor: x ^ y.""" | |||
"""Element-wise `logical xor: x ^ y`.""" | |||
return _elwise(x, y, mode="XOR") | |||
# comparison functions | |||
def eq(x, y): | |||
"""Element-wise (x == y). | |||
def equal(x, y): | |||
"""Element-wise `(x == y)`. | |||
:param x: input tensor 1. | |||
:param y: input tensor 2. | |||
@@ -465,7 +449,7 @@ def eq(x, y): | |||
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) | |||
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) | |||
out = F.eq(x, y) | |||
out = F.equal(x, y) | |||
print(out.numpy()) | |||
Outputs: | |||
@@ -479,28 +463,28 @@ def eq(x, y): | |||
return _elwise(x, y, mode="eq") | |||
def ne(x, y): | |||
"""Element-wise (x != y).""" | |||
def not_equal(x, y): | |||
"""Element-wise `(x != y)`.""" | |||
return x != y | |||
def lt(x, y): | |||
"""Element-wise (x < y).""" | |||
def less(x, y): | |||
"""Element-wise `(x < y)`.""" | |||
return _elwise(x, y, mode="lt") | |||
def le(x, y): | |||
"""Element-wise (x <= y).""" | |||
def less_equal(x, y): | |||
"""Element-wise `(x <= y)`.""" | |||
return _elwise(x, y, mode="leq") | |||
def gt(x, y): | |||
"""Element-wise (x > y).""" | |||
def greater(x, y): | |||
"""Element-wise `(x > y)`.""" | |||
return _elwise(y, x, mode="lt") | |||
def ge(x, y): | |||
"""Element-wise (x >= y).""" | |||
def greater_equal(x, y): | |||
"""Element-wise `(x >= y)`.""" | |||
return _elwise(y, x, mode="leq") | |||
@@ -508,7 +492,7 @@ def ge(x, y): | |||
def hswish(x): | |||
"""Element-wise x * relu6(x + 3) / 6. | |||
"""Element-wise `x * relu6(x + 3) / 6`. | |||
:param x: input tensor. | |||
:return: computed tensor. | |||
@@ -534,7 +518,7 @@ def hswish(x): | |||
def hsigmoid(x): | |||
"""Element-wise relu6(x + 3) / 6.""" | |||
"""Element-wise `relu6(x + 3) / 6`.""" | |||
return relu6(x + 3) / 6 | |||
@@ -544,16 +528,16 @@ def relu(x): | |||
def relu6(x): | |||
"""Element-wise min(max(x, 0), 6).""" | |||
"""Element-wise `min(max(x, 0), 6)`.""" | |||
return minimum(maximum(x, 0), 6) | |||
def sigmoid(x): | |||
"""Element-wise 1 / ( 1 + exp( -x ) ).""" | |||
"""Element-wise `1 / ( 1 + exp( -x ) )`.""" | |||
return _elwise(x, mode="sigmoid") | |||
def clamp(x: Tensor, lower=None, upper=None) -> Tensor: | |||
def clip(x: Tensor, lower=None, upper=None) -> Tensor: | |||
r"""Clamps all elements in input tensor into the range `[` :attr:`lower`, :attr:`upper` `]` and returns | |||
a resulting tensor: | |||
@@ -578,9 +562,9 @@ def clamp(x: Tensor, lower=None, upper=None) -> Tensor: | |||
import megengine.functional as F | |||
a = tensor(np.arange(5).astype(np.int32)) | |||
print(F.clamp(a, 2, 4).numpy()) | |||
print(F.clamp(a, lower=3).numpy()) | |||
print(F.clamp(a, upper=3).numpy()) | |||
print(F.clip(a, 2, 4).numpy()) | |||
print(F.clip(a, lower=3).numpy()) | |||
print(F.clip(a, upper=3).numpy()) | |||
Outputs: | |||
@@ -596,7 +580,7 @@ def clamp(x: Tensor, lower=None, upper=None) -> Tensor: | |||
), "At least one of 'lower' or 'upper' must not be None" | |||
if lower is not None: | |||
if upper is not None: | |||
assert lower <= upper, "clamp lower bound is bigger that upper bound" | |||
assert lower <= upper, "clip lower bound is bigger that upper bound" | |||
return minimum(maximum(x, lower), upper) | |||
else: | |||
return maximum(x, lower) | |||
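The dispatch above covers the both-bounds and lower-only cases; the upper-only branch is elided by this diff. A NumPy sketch of the full semantics under that assumption (``clip_ref`` is a hypothetical reference, not the MegEngine implementation):

    import numpy as np

    def clip_ref(x, lower=None, upper=None):
        # same three cases: both bounds, lower only, upper only
        assert (lower is not None) or (upper is not None)
        if lower is not None and upper is not None:
            assert lower <= upper
            return np.minimum(np.maximum(x, lower), upper)
        if lower is not None:
            return np.maximum(x, lower)
        return np.minimum(x, upper)

    a = np.arange(5)
    print(clip_ref(a, 2, 4))     # [2 2 2 3 4]
    print(clip_ref(a, lower=3))  # [3 3 3 3 4]
    print(clip_ref(a, upper=3))  # [0 1 2 3 3]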
@@ -1,44 +0,0 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# pylint: disable=too-many-lines | |||
from typing import List | |||
from ..tensor import Tensor | |||
def cambricon_subgraph( | |||
inputs: List[Tensor], data: bytes, symbol: str, tensor_dim_mutable: bool, | |||
) -> List[Tensor]: | |||
"""Loads a serialized Cambricon subgraph (i.e. cnrtModel_t) and | |||
executes the operations defined in the subgraph. | |||
:param inputs: list of input tensors of the subgraph. | |||
:param data: the serialized subgraph. | |||
:param symbol: the name of the function in the subgraph. | |||
The function is corresponding to a cnmlFusionOp | |||
which is added to the cnmlModel_t/cnrtModel_t. | |||
:param tensor_dim_mutable: whether the input tensors' shapes are mutable | |||
in cnrtModel_t. | |||
""" | |||
raise NotImplementedError | |||
def extern_opr_subgraph( | |||
inputs, output_shapes: List[tuple], dump_name: str, dump_data: bytes, | |||
) -> List[Tensor]: | |||
"""Loads a serialized extern opr subgraph and fake execute the operator. | |||
:param inputs: tensor or list of input tensors. | |||
:param output_shapes: the output shapes. | |||
:param dump_name: the serialized subgraph name. | |||
:param dump_data: the serialized subgraph. | |||
:return: list of tensors. | |||
""" | |||
raise NotImplementedError |
@@ -1,41 +0,0 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
import collections | |||
from typing import Iterable, Optional, Union | |||
from ..tensor import Tensor | |||
def add_update( | |||
dest: Tensor, | |||
delta: Tensor, | |||
*, | |||
alpha: Union[Tensor, float, int] = 1.0, | |||
beta: Union[Tensor, float, int] = 1.0, | |||
bias: Union[Tensor, float, int] = 0.0 | |||
): | |||
r"""Modify ``dest`` inplace as follows: | |||
.. math:: | |||
dest = alpha * dest + beta * delta + bias | |||
:param dest: input data that will be inplace modified. | |||
:param delta: update value that will be added to ``dest``. | |||
:param alpha: weight ratio of ``dest``. Default: 1.0 | |||
:param beta: weight ratio of ``delta``. Default: 1.0 | |||
:param bias: bias value appended to the result. Default: 0.0 | |||
""" | |||
if beta is not None and beta != 1.0: | |||
delta = delta * beta | |||
if bias is not None and bias != 0.0: | |||
delta = delta + bias | |||
if alpha is not None and alpha != 1.0: | |||
dest *= alpha | |||
dest += delta | |||
return dest |
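A NumPy sketch of the same in-place update rule, ``dest = alpha * dest + beta * delta + bias`` (``add_update_ref`` is a hypothetical reference helper, not the MegEngine implementation):

    import numpy as np

    def add_update_ref(dest, delta, alpha=1.0, beta=1.0, bias=0.0):
        # dest <- alpha * dest + beta * delta + bias, modifying dest in place
        dest *= alpha
        dest += beta * delta + bias
        return dest

    d = np.ones(3, dtype=np.float32)
    print(add_update_ref(d, np.full(3, 2.0, dtype=np.float32), alpha=0.9, beta=0.1))
    # [1.1 1.1 1.1]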
@@ -10,14 +10,14 @@ import numpy as np | |||
from ..core.tensor.utils import make_shape_tuple | |||
from ..tensor import Tensor | |||
from .elemwise import abs, eq, exp, log, maximum, pow, relu | |||
from .nn import indexing_one_hot | |||
from .elemwise import abs, equal, exp, log, maximum, pow, relu | |||
from .nn import indexing_one_hot, logsigmoid, logsumexp | |||
from .tensor import where | |||
__all__ = [ | |||
"l1_loss", | |||
"square_loss", | |||
"cross_entropy_with_softmax", | |||
"cross_entropy", | |||
"binary_cross_entropy", | |||
"hinge_loss", | |||
] | |||
@@ -55,7 +55,7 @@ def l1_loss(pred: Tensor, label: Tensor) -> Tensor: | |||
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32)) | |||
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32)) | |||
loss = F.l1_loss(ipt, tgt) | |||
loss = F.nn.l1_loss(ipt, tgt) | |||
print(loss.numpy()) | |||
Outputs: | |||
@@ -106,7 +106,7 @@ def square_loss(pred: Tensor, label: Tensor) -> Tensor: | |||
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32)) | |||
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32)) | |||
loss = F.square_loss(ipt, tgt) | |||
loss = F.nn.square_loss(ipt, tgt) | |||
print(loss.numpy()) | |||
Outputs: | |||
@@ -120,10 +120,16 @@ def square_loss(pred: Tensor, label: Tensor) -> Tensor: | |||
return (diff ** 2).mean() | |||
def cross_entropy_with_softmax( | |||
pred: Tensor, label: Tensor, axis: int = 1, label_smooth: float = 0 | |||
def cross_entropy( | |||
pred: Tensor, | |||
label: Tensor, | |||
axis: int = 1, | |||
with_logits: bool = True, | |||
label_smooth: float = 0, | |||
) -> Tensor: | |||
r"""Returns loss after applying :func:`~.softmax` + :func:`~.cross_entropy`. | |||
r"""Compute the multi-class cross entropy loss (using logits by default). | |||
By default, prediction is assumed to be logits, whose softmax gives probabilities. | |||
It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`. | |||
@@ -132,11 +138,12 @@ def cross_entropy_with_softmax( | |||
.. math:: y^{LS}_{k}=y_{k}\left(1-\alpha\right)+\alpha/K | |||
where :math:`y^{LS}` and :math:`y` are new label distribution and origin label distribution respectively. | |||
k is the index of label distribution. :math:`\alpha` is label_smooth and :math:`K` is the number of classes. | |||
k is the index of label distribution. :math:`\alpha` is ``label_smooth`` and :math:`K` is the number of classes. | |||
:param pred: input tensor representing the predicted probability. | |||
:param label: input tensor representing the classification label. | |||
:param axis: an axis along which softmax will be applied. Default: 1 | |||
:param with_logits: whether to apply softmax first. Default: True | |||
:param label_smooth: a label smoothing of parameter that can re-distribute target distribution. Default: 0 | |||
:return: loss value. | |||
@@ -150,9 +157,9 @@ def cross_entropy_with_softmax( | |||
data_shape = (1, 2) | |||
label_shape = (1, ) | |||
pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(data_shape)) | |||
pred = tensor(np.array([0, 0], dtype=np.float32).reshape(data_shape)) | |||
label = tensor(np.ones(label_shape, dtype=np.int32)) | |||
loss = F.cross_entropy_with_softmax(pred, label) | |||
loss = F.nn.cross_entropy(pred, label) | |||
print(loss.numpy()) | |||
Outputs: | |||
@@ -170,26 +177,41 @@ def cross_entropy_with_softmax( | |||
) | |||
num_classes = pred.shape[axis] | |||
no_label_smooth = ( | |||
label_smooth is None or type(label_smooth) in (int, float) and label_smooth == 0 | |||
) | |||
if not with_logits: | |||
if no_label_smooth: | |||
return -log(indexing_one_hot(pred, label, axis)).mean() | |||
pred = log(pred) | |||
return ( | |||
label_smooth * pred.mean() | |||
- (1 - label_smooth) * indexing_one_hot(pred, label, axis).mean() | |||
) | |||
# Denominator of the softmax | |||
offset = pred.max(axis=axis, keepdims=True).detach() | |||
pred = pred - offset | |||
down = exp(pred).sum(axis=axis, keepdims=True) | |||
down = logsumexp(pred, axis=axis, keepdims=True) | |||
up = indexing_one_hot(pred, label, axis) | |||
if label_smooth != 0: | |||
if not no_label_smooth: | |||
factor = label_smooth / num_classes | |||
up = up * (1 - label_smooth) + pred.sum(axis=axis, keepdims=True) * factor | |||
return (log(down) - up).mean() | |||
return (down - up).mean() | |||
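A NumPy sketch of the logits path above: shift by the per-row max, take the log-sum-exp denominator, gather the numerator at the label index, and optionally smooth it (``cross_entropy_ref`` is a hypothetical reference, not the MegEngine kernel):

    import numpy as np

    def cross_entropy_ref(pred, label, axis=1, label_smooth=0.0):
        # subtract the max for stability, then -log softmax at the label index
        pred = pred - pred.max(axis=axis, keepdims=True)
        down = np.log(np.exp(pred).sum(axis=axis, keepdims=True))  # logsumexp
        up = np.take_along_axis(pred, np.expand_dims(label, axis), axis)
        if label_smooth != 0:
            factor = label_smooth / pred.shape[axis]
            up = up * (1 - label_smooth) + pred.sum(axis=axis, keepdims=True) * factor
        return (down - up).mean()

    pred = np.zeros((1, 2), dtype=np.float32)
    label = np.ones((1,), dtype=np.int64)
    print(cross_entropy_ref(pred, label))  # ~0.6931 = log(2)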
def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor: | |||
r"""Function that measures the Binary Cross Entropy between the target and the prediction. | |||
def binary_cross_entropy( | |||
pred: Tensor, label: Tensor, with_logits: bool = True | |||
) -> Tensor: | |||
r"""Compute the binary cross entropy loss (using logits by default). | |||
By default, prediction is assumed to be logits, whose sigmoid gives probabilities. | |||
:param pred: `(N, *)` where `*` means any number of additional dimensions. | |||
:param pred: `(N, *)`, where `*` means any number of additional dimensions. | |||
:param label: `(N, *)`, same shape as the input. | |||
:param with_logits: bool, whether to apply sigmoid first. Default: True | |||
:return: loss value. | |||
Examples: | |||
@@ -200,9 +222,9 @@ def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor: | |||
from megengine import tensor | |||
import megengine.functional as F | |||
pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(1, 2)) | |||
pred = tensor(np.array([0, 0], dtype=np.float32).reshape(1, 2)) | |||
label = tensor(np.ones((1, 2), dtype=np.float32)) | |||
loss = F.binary_cross_entropy(pred, label) | |||
loss = F.nn.binary_cross_entropy(pred, label) | |||
print(loss.numpy()) | |||
Outputs: | |||
@@ -212,11 +234,15 @@ def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor: | |||
[0.6931] | |||
""" | |||
return -1.0 * (label * log(pred) + (1.0 - label) * log(1 - pred)).mean() | |||
if not with_logits: | |||
return -(label * log(pred) + (1 - label) * log(1 - pred)).mean() | |||
# logsigmoid(pred) and logsigmoid(-pred) have a common sub-expression | |||
# hopefully the backend would optimize this | |||
return -(label * logsigmoid(pred) + (1 - label) * logsigmoid(-pred)).mean() | |||
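A NumPy sketch of the stable logits form above, with ``logsigmoid`` expanded using the same ``log1p``/``max`` rewrite as ``softplus`` (hypothetical reference helpers, not the MegEngine implementation):

    import numpy as np

    def logsigmoid_ref(x):
        # log(sigmoid(x)) = -softplus(-x), written stably
        return -(np.log1p(np.exp(-np.abs(x))) + np.maximum(-x, 0))

    def bce_with_logits_ref(pred, label):
        # -(y * logsigmoid(x) + (1 - y) * logsigmoid(-x)).mean()
        return -(label * logsigmoid_ref(pred) + (1 - label) * logsigmoid_ref(-pred)).mean()

    pred = np.zeros((1, 2), dtype=np.float32)
    label = np.ones((1, 2), dtype=np.float32)
    print(bce_with_logits_ref(pred, label))  # ~0.6931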
def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor: | |||
r"""Caculate the hinge loss which is often used in SVMs. | |||
r"""Caculates the hinge loss which is often used in SVM. | |||
The hinge loss can be described as: | |||
@@ -236,7 +262,7 @@ def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor: | |||
pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32") | |||
label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32") | |||
loss = F.hinge_loss(pred, label) | |||
loss = F.nn.hinge_loss(pred, label) | |||
print(loss.numpy()) | |||
Outputs: | |||
@@ -14,11 +14,12 @@ from typing import Optional, Sequence, Tuple, Union | |||
from ..core.ops import builtin | |||
from ..core.ops._internal import param_defs as P | |||
from ..core.ops.special import Const | |||
from ..core.tensor import utils | |||
from ..core.tensor.core import apply | |||
from ..core.tensor.core import TensorBase, TensorWrapperBase, apply | |||
from ..tensor import Tensor | |||
from .elemwise import clamp, exp, log, log1p | |||
from .tensor import add_axis, remove_axis, reshape | |||
from .elemwise import clip, exp, log, log1p | |||
from .tensor import reshape, squeeze | |||
__all__ = [ | |||
"argmax", | |||
@@ -45,7 +46,7 @@ def isnan(inp: Tensor) -> Tensor: | |||
r"""Returns a new tensor representing if each element is ``NaN`` or not. | |||
:param inp: input tensor. | |||
:return: a new tensor representing if each element in inp is NaN or not. | |||
:return: result tensor. | |||
Examples: | |||
@@ -71,7 +72,7 @@ def isinf(inp: Tensor) -> Tensor: | |||
r"""Returns a new tensor representing if each element is ``Inf`` or not. | |||
:param inp: input tensor. | |||
:return: a new tensor representing if each element in inp is Inf or not. | |||
:return: result tensor. | |||
Examples: | |||
@@ -84,7 +85,7 @@ def isinf(inp: Tensor) -> Tensor: | |||
print(F.isinf(x).numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[False True False] | |||
@@ -108,7 +109,7 @@ def sign(inp: Tensor): | |||
x = tensor([1, -1, 0]) | |||
print(F.sign(x).numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
@@ -128,7 +129,7 @@ def sum( | |||
reduce over all of them. | |||
:param inp: input tensor. | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. | |||
:param axis: dimension to reduce. If None, all dimensions will be reduced. | |||
Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. | |||
Default: False | |||
@@ -163,7 +164,7 @@ def prod( | |||
reduce over all of them. | |||
:param inp: input tensor. | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None | |||
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. Default: False | |||
:return: output tensor. | |||
@@ -199,7 +200,7 @@ def mean( | |||
reduce over all of them. | |||
:param inp: input tensor. | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None | |||
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. Default: False | |||
:return: output tensor. | |||
@@ -235,7 +236,7 @@ def var( | |||
reduce over all of them. | |||
:param inp: input tensor. | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None | |||
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. Default: False | |||
:return: output tensor. | |||
@@ -275,7 +276,7 @@ def std( | |||
reduce over all of them. | |||
:param inp: input tensor. | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None | |||
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. Default: False | |||
:return: output tensor. | |||
@@ -310,7 +311,7 @@ def min( | |||
reduce over all of them. | |||
:param inp: input tensor. | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None | |||
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. Default: False | |||
:return: output tensor. | |||
@@ -346,7 +347,7 @@ def max( | |||
reduce over all of them. | |||
:param inp: input tensor. | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None | |||
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. Default: False | |||
:return: output tensor. | |||
@@ -373,18 +374,14 @@ def max( | |||
def norm( | |||
inp: Tensor, | |||
p: int = 2, | |||
axis: Optional[Union[int, Sequence[int]]] = None, | |||
keepdims=False, | |||
inp: Tensor, ord: float = None, axis: int = None, keepdims=False, | |||
): | |||
"""Calculates ``p``-norm of input tensor along | |||
given axis. If axis is a list of dimensions, | |||
reduce over all of them. | |||
given axis. | |||
:param inp: input tensor. | |||
:param p: power of value applied to inp. Default: 2 | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None | |||
:param ord: power of value applied to inp. Default: 2 | |||
:param axis: dimension to reduce. If None, input must be a vector. Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. Default: False | |||
:return: output tensor. | |||
@@ -396,7 +393,7 @@ def norm( | |||
from megengine import tensor | |||
import megengine.functional as F | |||
x = tensor(np.arange(-3, 3, dtype=np.float32).reshape(2,3)) | |||
x = tensor(np.arange(-3, 3, dtype=np.float32)) | |||
out = F.norm(x) | |||
print(out.numpy()) | |||
@@ -407,13 +404,18 @@ def norm( | |||
[4.3589] | |||
""" | |||
if p == 0: | |||
if axis is None: | |||
if inp.ndim != 1: | |||
raise TypeError("axis is required unless input is a vector") | |||
if ord is None: | |||
ord = 2 | |||
if ord == 0: | |||
return sum(inp != 0, axis=axis, keepdims=keepdims) | |||
if p == math.inf: | |||
if ord == math.inf: | |||
return max(abs(inp)) | |||
if p == -math.inf: | |||
if ord == -math.inf: | |||
return min(abs(inp)) | |||
return sum(abs(inp) ** p, axis=axis, keepdims=keepdims) ** (1.0 / p) | |||
return sum(abs(inp) ** ord, axis=axis, keepdims=keepdims) ** (1.0 / ord) | |||
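A NumPy sketch of the ``ord`` dispatch above for a 1-d input: ``0`` counts non-zero entries, ``±inf`` reduce to the max/min of the absolute values, and any other ``ord`` uses the p-norm formula (``norm_ref`` is a hypothetical reference):

    import numpy as np

    def norm_ref(x, ord=2):
        # 1-d reference for the dispatch above
        if ord == 0:
            return np.count_nonzero(x)
        if ord == np.inf:
            return np.abs(x).max()
        if ord == -np.inf:
            return np.abs(x).min()
        return (np.abs(x) ** ord).sum() ** (1.0 / ord)

    x = np.arange(-3, 3, dtype=np.float32)
    print(norm_ref(x))              # ~4.3589, matches the docstring example
    print(norm_ref(x, ord=np.inf))  # 3.0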
def argmin( | |||
@@ -426,7 +428,7 @@ def argmin( | |||
reduce over all of them. | |||
:param inp: input tensor. | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None | |||
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. Default: False | |||
:return: output tensor. | |||
@@ -458,7 +460,7 @@ def argmin( | |||
(inp,) = apply(op, inp) | |||
if not keepdims: | |||
inp = remove_axis(inp, ai) | |||
inp = squeeze(inp, ai) | |||
return inp | |||
@@ -470,7 +472,7 @@ def argmin( | |||
op = builtin.Argmin(axis=axis) | |||
(result,) = apply(op, inp) | |||
if not keepdims: | |||
result = remove_axis(result, axis) | |||
result = squeeze(result, axis) | |||
return result | |||
@@ -484,7 +486,7 @@ def argmax( | |||
reduce over all of them. | |||
:param inp: input tensor. | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None | |||
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None | |||
:param keepdims: whether the output tensor has axis retained or not. Default: False | |||
:return: output tensor. | |||
@@ -516,7 +518,7 @@ def argmax( | |||
(inp,) = apply(op, inp) | |||
if not keepdims: | |||
inp = remove_axis(inp, ai) | |||
inp = squeeze(inp, ai) | |||
return inp | |||
@@ -528,45 +530,40 @@ def argmax( | |||
op = builtin.Argmax(axis=axis) | |||
(result,) = apply(op, inp) | |||
if not keepdims: | |||
result = remove_axis(result, axis) | |||
result = squeeze(result, axis) | |||
return result | |||
def normalize( | |||
inp: Tensor, | |||
p: int = 2, | |||
axis: Optional[Union[int, Sequence[int]]] = None, | |||
eps: float = 1e-12, | |||
inp: Tensor, ord: float = None, axis: int = None, eps: float = 1e-12, | |||
) -> Tensor: | |||
r"""Performs :math:`L_p` normalization of input tensor along | |||
given axis. If axis is a list of dimensions, | |||
reduce over all of them. | |||
given axis. | |||
For a tensor inp of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each | |||
For a tensor of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each | |||
:math:`n_{dim}` -element vector :math:`v` along dimension :attr:`axis` is transformed as: | |||
.. math:: | |||
v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}. | |||
:param inp: input tensor. | |||
:param p: power of value applied to inp. Default: 2 | |||
:param axis: dimension to reduce. If None, all the dimensions will be reduced | |||
to calculate the norm. Default: None | |||
:param ord: power of value applied to input tensor. Default: 2 | |||
:param axis: dimension to reduce. If None, input must be a vector. Default: None | |||
:param eps: a small value to avoid division by zero. Default: 1e-12 | |||
:return: normalized output tensor. | |||
""" | |||
if axis is None: | |||
return inp / clamp(norm(inp, p, axis), lower=eps) | |||
return inp / clip(norm(inp, ord, axis), lower=eps) | |||
else: | |||
return inp / clamp(norm(inp, p, axis, keepdims=True), lower=eps) | |||
return inp / clip(norm(inp, ord, axis, keepdims=True), lower=eps) | |||
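A NumPy sketch of the normalization above: each vector along ``axis`` is divided by ``max(norm, eps)``, so zero vectors stay zero instead of producing NaN (``normalize_ref`` is a hypothetical reference):

    import numpy as np

    def normalize_ref(x, ord=2, axis=None, eps=1e-12):
        # v / max(||v||_ord, eps); axis=None expects a vector, as above
        if axis is None:
            n = (np.abs(x) ** ord).sum() ** (1.0 / ord)
            return x / np.maximum(n, eps)
        n = (np.abs(x) ** ord).sum(axis=axis, keepdims=True) ** (1.0 / ord)
        return x / np.maximum(n, eps)

    x = np.array([[3.0, 4.0], [0.0, 0.0]], dtype=np.float32)
    print(normalize_ref(x, axis=1))  # rows scaled to unit L2 norm; zero row stays 0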
def argsort(inp: Tensor, descending: bool = False) -> Tensor: | |||
r"""Sorts the target 2d matrix by row, return both the sorted tensor and indices. | |||
r"""Returns the indices that would sort the input tensor. | |||
:param inp: input tensor, if 2d, each row will be sorted. | |||
:param descending: Sort in descending order, where the largest comes first. Default: False | |||
:return: Tuple of two tensors `(sorted_tensor, indices_of_int32)`. | |||
:param inp: input tensor. If it's 2d, the result would be an array of indices that shows how to sort each row of the input tensor. | |||
:param descending: sort in descending order, where the largest comes first. Default: False | |||
:return: indices of int32 indicates how to sort the input. | |||
Examples: | |||
@@ -603,6 +600,31 @@ def argsort(inp: Tensor, descending: bool = False) -> Tensor: | |||
def sort(inp: Tensor, descending: bool = False) -> Tuple[Tensor, Tensor]: | |||
r"""Returns sorted tensor and the indices would sort the input tensor. | |||
:param inp: input tensor. If it's 2d, the result would be sorted by row. | |||
:param descending: sort in descending order, where the largest comes first. Default: False | |||
:return: tuple of two tensors `(sorted_tensor, indices_of_int32)`. | |||
Examples: | |||
.. testcode:: | |||
import numpy as np | |||
from megengine import tensor | |||
import megengine.functional as F | |||
x = tensor(np.array([1,2], dtype=np.float32)) | |||
out, indices = F.sort(x) | |||
print(out.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[1. 2.] | |||
""" | |||
assert len(inp.shape) <= 2, "Input should be 1d or 2d" | |||
if descending: | |||
order = P.Argsort.Order.DESCENDING | |||
@@ -625,13 +647,13 @@ def topk( | |||
kth_only: bool = False, | |||
no_sort: bool = False, | |||
) -> Tuple[Tensor, Tensor]: | |||
r"""Selects the ``Top-K(by default)`` smallest elements of 2d matrix by row. | |||
r"""Selects the ``Top-K``(by default) smallest elements of 2d matrix by row. | |||
:param inp: input tensor, if 2d, each row will be sorted. | |||
:param inp: input tensor. If input tensor is 2d, each row will be sorted. | |||
:param k: number of elements needed. | |||
:param descending: if true, return the largest elements instead. Default: False | |||
:param kth_only: if true, only the k-th element will be returned. Default: False | |||
:param no_sort: if true, the returned elements can be unordered. Default: False | |||
:param descending: if True, return the largest elements instead. Default: False | |||
:param kth_only: if True, only the k-th element will be returned. Default: False | |||
:param no_sort: if True, the returned elements can be unordered. Default: False | |||
:return: tuple of two tensors `(topk_tensor, indices_of_int32)`. | |||
Examples: | |||
@@ -665,15 +687,18 @@ def topk( | |||
mode = Mode.VALUE_IDX_SORTED | |||
op = builtin.TopK(mode=mode) | |||
if not isinstance(k, (TensorBase, TensorWrapperBase)): | |||
(k,) = Const(k, dtype="int32", device=inp.device)(inp) | |||
if len(inp.shape) == 1: | |||
inp = inp.reshape(1, -1) | |||
res = apply(op, inp, Tensor(k, dtype="int32")) | |||
res = apply(op, inp, k) | |||
if kth_only: | |||
tns = res[0] | |||
else: | |||
tns, ind = res[0][0], res[1][0] | |||
else: | |||
res = apply(op, inp, Tensor(k, dtype="int32")) | |||
res = apply(op, inp, k) | |||
if kth_only: | |||
tns = res | |||
else: | |||
@@ -13,46 +13,51 @@ from ..core._imperative_rt import CompNode | |||
from ..core.ops import builtin | |||
from ..core.ops._internal import param_defs as P | |||
from ..core.ops.special import Const | |||
from ..core.tensor import utils | |||
from ..core.tensor import megbrain_graph, utils | |||
from ..core.tensor.core import TensorBase, TensorWrapperBase, apply | |||
from ..core.tensor.utils import astensor1d | |||
from ..distributed import WORLD, is_distributed | |||
from ..jit.tracing import is_tracing | |||
from ..random import uniform | |||
from ..tensor import Tensor | |||
from .debug_param import get_conv_execution_strategy | |||
from .distributed import all_reduce_sum | |||
from .elemwise import exp, floor, log, log1p, maximum, minimum, relu | |||
from .math import argsort, max, sum | |||
from .tensor import add_axis, broadcast, concat, full, ones, remove_axis, reshape, zeros | |||
from .tensor import ( | |||
broadcast_to, | |||
concat, | |||
expand_dims, | |||
full, | |||
ones, | |||
reshape, | |||
squeeze, | |||
zeros, | |||
) | |||
from .types import _pair, _pair_nonzero | |||
__all__ = [ | |||
"adaptive_avg_pool2d", | |||
"adaptive_max_pool2d", | |||
"avg_pool2d", | |||
"batched_nms", | |||
"batch_norm2d", | |||
"batch_norm", | |||
"conv2d", | |||
"conv_transpose2d", | |||
"dot", | |||
"dropout", | |||
"embedding", | |||
"indexing_one_hot", | |||
"interpolate", | |||
"leaky_relu", | |||
"linear", | |||
"local_conv2d", | |||
"logsigmoid", | |||
"logsumexp", | |||
"log_softmax", | |||
"logsoftmax", | |||
"matmul", | |||
"max_pool2d", | |||
"nms", | |||
"one_hot", | |||
"prelu", | |||
"roi_align", | |||
"roi_pooling", | |||
"softmax", | |||
"softplus", | |||
"svd", | |||
"sync_batch_norm", | |||
"warp_perspective", | |||
] | |||
@@ -106,19 +111,18 @@ def conv2d( | |||
:param padding: size of the paddings added to the input on both sides of its | |||
spatial dimensions. Only zero-padding is supported. Default: 0 | |||
:param dilation: dilation of the 2D convolution operation. Default: 1 | |||
:param groups: number of groups to divide input and output channels into, | |||
so as to perform a ``grouped convolution``. When groups is not 1, | |||
in_channels and out_channels must be divisible by groups, | |||
:param groups: number of groups into which the input and output channels are divided, so as to perform a ``grouped convolution``. When ``groups`` is not 1, | |||
``in_channels`` and ``out_channels`` must be divisible by ``groups``, | |||
and the shape of weight should be `(groups, out_channel // groups, | |||
in_channels // groups, height, width)`. | |||
:type conv_mode: string or :class:`P.Convolution.Mode`. | |||
:type conv_mode: string or :class:`P.Convolution.Mode` | |||
:param conv_mode: supports "CROSS_CORRELATION" or "CONVOLUTION". Default: | |||
"CROSS_CORRELATION" | |||
:type compute_mode: string or | |||
:class:`P.Convolution.ComputeMode`. | |||
:class:`P.Convolution.ComputeMode` | |||
:param compute_mode: when set to "DEFAULT", no special requirements will be | |||
placed on the precision of intermediate results. When set to "FLOAT32", | |||
Float32 would be used for accumulator and intermediate result, but only | |||
"Float32" would be used for accumulator and intermediate result, but only | |||
effective when input and output are of Float16 dtype. | |||
:return: output tensor. | |||
""" | |||
@@ -167,24 +171,23 @@ def conv_transpose2d( | |||
:param inp: feature map of the convolution operation. | |||
:param weight: convolution kernel. | |||
:param bias: bias added to the result of convolution (if given) | |||
:param bias: bias added to the result of convolution (if given). | |||
:param stride: stride of the 2D convolution operation. Default: 1 | |||
:param padding: size of the paddings added to the input on both sides of its | |||
spatial dimensions. Only zero-padding is supported. Default: 0 | |||
:param dilation: dilation of the 2D convolution operation. Default: 1 | |||
:param groups: number of groups to divide input and output channels into, | |||
so as to perform a ``grouped convolution``. When groups is not 1, | |||
in_channels and out_channels must be divisible by groups, | |||
:param groups: number of groups into which the input and output channels are divided, so as to perform a ``grouped convolution``. When ``groups`` is not 1, | |||
``in_channels`` and ``out_channels`` must be divisible by groups, | |||
and the shape of weight should be `(groups, out_channel // groups, | |||
in_channels // groups, height, width)`. Default: 1 | |||
:type conv_mode: string or :class:`P.Convolution.Mode`. | |||
:type conv_mode: string or :class:`P.Convolution.Mode` | |||
:param conv_mode: supports "CROSS_CORRELATION" or "CONVOLUTION". Default: | |||
"CROSS_CORRELATION" | |||
:type compute_mode: string or | |||
:class:`P.Convolution.ComputeMode`. | |||
:class:`P.Convolution.ComputeMode` | |||
:param compute_mode: when set to "DEFAULT", no special requirements will be | |||
placed on the precision of intermediate results. When set to "FLOAT32", | |||
Float32 would be used for accumulator and intermediate result, but only | |||
"Float32" would be used for accumulator and intermediate result, but only | |||
effective when input and output are of Float16 dtype. | |||
:return: output tensor. | |||
""" | |||
@@ -222,10 +225,8 @@ def local_conv2d( | |||
padding: Union[int, Tuple[int, int]] = 0, | |||
dilation: Union[int, Tuple[int, int]] = 1, | |||
conv_mode="CROSS_CORRELATION", | |||
) -> Tensor: | |||
"""Applies spatial 2D convolution over an image with untied kernels. | |||
Refer to :class:`~.LocalConv2d` for more information. | |||
): | |||
"""Applies spatial 2D convolution over an groupped channeled image with untied kernels. | |||
""" | |||
assert conv_mode == "CROSS_CORRELATION" or conv_mode.name == "CROSS_CORRELATION" | |||
@@ -233,6 +234,8 @@ def local_conv2d( | |||
pad_h, pad_w = expand_hw(padding) | |||
dilate_h, dilate_w = expand_hw(dilation) | |||
Sparse = P.Convolution.Sparse | |||
op = builtin.GroupLocal( | |||
stride_h=stride_h, | |||
stride_w=stride_w, | |||
@@ -240,7 +243,9 @@ def local_conv2d( | |||
pad_w=pad_w, | |||
dilate_h=dilate_h, | |||
dilate_w=dilate_w, | |||
# strategy=get_conv_execution_strategy(), | |||
mode=conv_mode, | |||
compute_mode="DEFAULT", | |||
sparse=Sparse.DENSE, | |||
) | |||
inp, weight = utils.convert_inputs(inp, weight) | |||
(output,) = apply(op, inp, weight) | |||
@@ -263,7 +268,7 @@ def max_pool2d( | |||
:param kernel_size: size of the window. | |||
:param stride: stride of the window. If not provided, its value is set to kernel_size. | |||
Default: None | |||
:param padding: implicit zero padding to be added on both sides. Default: 0 | |||
:param padding: implicit zero padding added on both sides. Default: 0 | |||
:return: output tensor. | |||
""" | |||
if stride is None: | |||
@@ -292,15 +297,15 @@ def avg_pool2d( | |||
padding: Union[int, Tuple[int, int]] = 0, | |||
mode: str = "AVERAGE_COUNT_EXCLUDE_PADDING", | |||
) -> Tensor: | |||
"""Applies a 2D average pooling over an input tensor. | |||
"""Applies 2D average pooling over an input tensor. | |||
Refer to :class:`~.AvgPool2d` for more information. | |||
:param inp: input tensor. | |||
:param kernel_size: size of the window. | |||
:param stride: stride of the window. If not provided, its value is set to kernel_size. | |||
:param stride: stride of the window. If not provided, its value is set to ``kernel_size``. | |||
Default: None | |||
:param padding: implicit zero padding to be added on both sides. Default: 0 | |||
:param padding: implicit zero padding added on both sides. Default: 0 | |||
:param mode: whether to count padding values. Default: "AVERAGE_COUNT_EXCLUDE_PADDING" | |||
:return: output tensor. | |||
""" | |||
@@ -323,6 +328,48 @@ def avg_pool2d( | |||
return output | |||
def adaptive_max_pool2d( | |||
inp: Tensor, oshp: Union[Tuple[int, int], int, Tensor], | |||
) -> Tensor: | |||
"""Applies a 2D max adaptive pooling over an input. | |||
Refer to :class:`~.MaxAdaptivePool2d` for more information. | |||
:param inp: The input tensor. | |||
:param oshp: (OH, OW) size of the output shape. | |||
:return: output tensor. | |||
""" | |||
assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type" | |||
if isinstance(oshp, int): | |||
oshp = (oshp, oshp) | |||
op = builtin.AdaptivePooling(mode="MAX", format="NCHW",) | |||
oshp = astensor1d(oshp, inp, dtype="int32", device=inp.device) | |||
(output,) = apply(op, inp, oshp) | |||
return output | |||
def adaptive_avg_pool2d( | |||
inp: Tensor, oshp: Union[Tuple[int, int], int, Tensor], | |||
) -> Tensor: | |||
"""Applies a 2D average adaptive pooling over an input. | |||
Refer to :class:`~.AvgAdaptivePool2d` for more information. | |||
:param inp: The input tensor. | |||
:param oshp: (OH, OW) size of the output shape. | |||
:return: output tensor. | |||
""" | |||
assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type" | |||
if isinstance(oshp, int): | |||
oshp = (oshp, oshp) | |||
op = builtin.AdaptivePooling(mode="AVERAGE", format="NCHW",) | |||
oshp = astensor1d(oshp, inp, dtype="int32", device=inp.device) | |||
(output,) = apply(op, inp, oshp) | |||
return output | |||
def prelu(inp: Tensor, weight: Tensor) -> Tensor: | |||
r""" | |||
Applies the element-wise PReLU function. | |||
@@ -346,17 +393,17 @@ def softplus(inp: Tensor) -> Tensor: | |||
.. math:: | |||
\text{softplus}(x) = \log(1 + \exp(x)) | |||
softplus is a smooth approximation to the ReLU function and can be used | |||
to constrain the output of a machine to always be positive. | |||
to constrain the output to be always positive. | |||
For numerical stability the implementation follows this transformation: | |||
.. math:: | |||
\text{softplus}(x) = \log(1 + \exp(x)) | |||
= \log(1 + \exp(-\text{abs}(x))) + \max(x, 0) | |||
\text{softplus}(x) = \log(1 + \exp(x)) | |||
= \log(1 + \exp(-\text{abs}(x))) + \max(x, 0) | |||
= \log1p(\exp(-\text{abs}(x))) + \text{relu}(x) | |||
:param inp: The input tensor | |||
:param inp: input tensor. | |||
Examples: | |||
@@ -369,9 +416,9 @@ def softplus(inp: Tensor) -> Tensor: | |||
x = tensor(np.arange(-3, 3, dtype=np.float32)) | |||
y = F.softplus(x) | |||
print(y.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[0.0486 0.1269 0.3133 0.6931 1.3133 2.1269] | |||
@@ -380,7 +427,7 @@ def softplus(inp: Tensor) -> Tensor: | |||
return log1p(exp(-abs(inp))) + relu(inp) | |||
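A NumPy sketch of the rewrite above; ``log1p(exp(-|x|)) + relu(x)`` stays finite where the naive ``log(1 + exp(x))`` would overflow (``softplus_ref`` is a hypothetical reference):

    import numpy as np

    def softplus_ref(x):
        # log(1 + exp(x)) rewritten as log1p(exp(-|x|)) + relu(x)
        return np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0)

    x = np.array([-1000.0, 0.0, 1000.0])
    # the naive log(1 + exp(x)) would overflow to inf at x = 1000
    print(softplus_ref(x))  # [   0.        0.6931 1000.    ]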
def log_softmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: | |||
def logsoftmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: | |||
r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional | |||
input Tensor. The LogSoftmax formulation can be simplified as: | |||
@@ -390,13 +437,13 @@ def log_softmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: | |||
For numerical stability the implementation follows this transformation: | |||
.. math:: | |||
\operatorname{logsoftmax}(x) | |||
\operatorname{logsoftmax}(x) | |||
= \log (\frac{\exp (x)}{\sum_{i}(\exp (x_{i}))}) | |||
= x - \log (\sum_{i}(\exp (x_{i}))) | |||
= x - logsumexp(x) | |||
:param inp: The input tensor | |||
:param axis: An axis along which log_softmax will be applied. | |||
:param inp: input tensor. | |||
:param axis: axis along which logsoftmax will be applied. | |||
Examples: | |||
@@ -407,11 +454,11 @@ def log_softmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: | |||
import megengine.functional as F | |||
x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5) | |||
y = F.log_softmax(x, axis=1) | |||
y = F.logsoftmax(x, axis=1) | |||
print(y.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[[-4.4519 -3.4519 -2.4519 -1.4519 -0.4519] | |||
@@ -430,7 +477,7 @@ def logsigmoid(inp: Tensor) -> Tensor: | |||
= - \log(1 + exp(-x)) | |||
= - \text{softplus}(-x) | |||
:param inp: The input tensor | |||
:param inp: input tensor. | |||
Examples: | |||
@@ -459,11 +506,10 @@ def logsumexp( | |||
inp: Tensor, axis: Union[int, Sequence[int]], keepdims: bool = False | |||
) -> Tensor: | |||
r""" | |||
Compute the log of the sum of exponentials of inputs along the given :attr:`axis`. | |||
The computation is numerically stabilized. | |||
Calculates the logarithm of the inputs' exponential sum along the given :attr:`axis`. | |||
.. math:: | |||
\operatorname{logsumexp}(\boldsymbol{x})= \log \sum_{j=1}^{n} \exp \left(x_{j}\right) | |||
For numerical stability, the implementation follows this transformation: | |||
@@ -472,18 +518,18 @@ def logsumexp( | |||
\operatorname{logsumexp}(\boldsymbol{x})= \log \sum_{j=1}^{n} \exp \left(x_{j}\right) | |||
= b + \log \sum_{j=1}^{n} \exp \left(x_{j}-b\right) | |||
where | |||
.. math:: | |||
b = \max(x_j) | |||
:param inp: The input tensor. | |||
:param axis: Axis over which the sum is taken. It can be a single axis or a list of axes. | |||
:param inp: input tensor. | |||
:param axis: axis over which the sum is taken. It could be a single axis or a list of axes. | |||
:param keepdims: whether to retain :attr:`axis` or not for the output tensor. | |||
Examples: | |||
.. testcode:: | |||
import numpy as np | |||
@@ -501,11 +547,11 @@ def logsumexp( | |||
[-0.5481 4.4519] | |||
""" | |||
max_value = max(inp, axis, keepdims=True) | |||
max_value = max(inp.detach(), axis, keepdims=True) | |||
if keepdims: | |||
return max_value + log(sum(exp(inp - max_value), axis, keepdims)) | |||
else: | |||
return remove_axis(max_value, axis=None) + log( | |||
return squeeze(max_value, axis=None) + log( | |||
sum(exp(inp - max_value), axis, keepdims) | |||
) | |||
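A NumPy sketch of the max-shift trick above (``logsumexp_ref`` is a hypothetical reference; the values match the docstring example):

    import numpy as np

    def logsumexp_ref(x, axis, keepdims=False):
        # b + log(sum(exp(x - b))) with b = max(x) along axis, for stability
        b = x.max(axis=axis, keepdims=True)
        out = b + np.log(np.exp(x - b).sum(axis=axis, keepdims=True))
        return out if keepdims else np.squeeze(out, axis=axis)

    x = np.arange(-5, 5, dtype=np.float32).reshape(2, 5)
    print(logsumexp_ref(x, axis=1))  # ~[-0.5481  4.4519]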
@@ -523,13 +569,13 @@ def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor: | |||
.. math:: | |||
\text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} | |||
It is applied to all elements along axis, and will re-scale them so that | |||
the elements lie in the range `[0, 1]` and sum to 1. | |||
It is applied to all elements along axis, and rescales elements so that | |||
they stay in the range `[0, 1]` and sum to 1. | |||
See :class:`~megengine.module.activation.Softmax` for more details. | |||
:param inp: The input tensor. | |||
:param axis: An axis along which softmax will be applied. By default, | |||
:param inp: input tensor. | |||
:param axis: an axis along which softmax will be applied. By default, | |||
softmax will apply along the highest ranked axis. | |||
Examples: | |||
@@ -560,7 +606,7 @@ def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor: | |||
return cached / down | |||
def batch_norm2d( | |||
def batch_norm( | |||
inp: Tensor, | |||
running_mean: Tensor = None, | |||
running_var: Tensor = None, | |||
@@ -572,7 +618,7 @@ def batch_norm2d( | |||
eps: float = 1e-5, | |||
inplace: bool = True | |||
): | |||
"""Applies batch normalization to the input. | |||
r"""Applies batch normalization to the input. | |||
Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information. | |||
@@ -584,26 +630,28 @@ def batch_norm2d( | |||
:param bias: bias tensor in the learnable affine parameters. | |||
See :math:`\beta` in :class:`~.BatchNorm2d`. | |||
:param training: a boolean value to indicate whether batch norm is performed | |||
in traning mode. Default: False | |||
in training mode. Default: False | |||
:param momentum: value used for the ``running_mean`` and ``running_var`` | |||
computation. | |||
Default: 0.9 | |||
:param eps: a value added to the denominator for numerical stability. | |||
Default: 1e-5 | |||
:param inplace: whether to update running_mean and running_var inplace or return new tensors | |||
:param inplace: whether to update ``running_mean`` and ``running_var`` inplace or return new tensors | |||
Default: True | |||
:return: output tensor. | |||
""" | |||
if inp.ndim != 4: | |||
raise NotImplementedError("batch_norm for ndim != 4") | |||
def full_value(value): | |||
C = inp.shape[1] | |||
(x,) = Const(value, dtype=inp.dtype, device=inp.device)(inp) | |||
return broadcast(x, [1, C, 1, 1]) | |||
return broadcast_to(x, [1, C, 1, 1]) | |||
def expand_or_full(x, value): | |||
if x is None: | |||
return full_value(value) | |||
return add_axis(x, [0, 2, 3]) | |||
return expand_dims(x, [0, 2, 3]) | |||
def make_full_if_none(x, value): | |||
if x is None: | |||
@@ -676,7 +724,7 @@ def sync_batch_norm( | |||
eps_mode="ADDITIVE", | |||
group=WORLD, | |||
) -> Tensor: | |||
"""Applies synchronized batch normalization to the input. | |||
r"""Applies synchronized batch normalization to the input. | |||
Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information. | |||
@@ -717,7 +765,7 @@ def sync_batch_norm( | |||
if is_distributed(): | |||
# reduce all nodes' data to calculate mean and variance | |||
reduce_size = broadcast(Tensor(reduce_size, dtype=_dtype), [1] * _ndim) | |||
reduce_size = broadcast_to(Tensor(reduce_size, dtype=_dtype), [1] * _ndim) | |||
stat = concat( | |||
[reduce_size.astype(_dtype), channel_x1s, channel_x2s], axis=1 | |||
) | |||
@@ -838,6 +886,10 @@ def warp_perspective( | |||
:param interp_mode: interpolation methods. Default: "LINEAR" | |||
:return: output tensor. | |||
Note: | |||
The transformation matrix is the inverse of that used by `cv2.warpPerspective`. | |||
Examples: | |||
.. testcode:: | |||
@@ -868,7 +920,8 @@ def warp_perspective( | |||
imode=interp_mode, bmode=border_mode, format="NCHW", border_val=border_val | |||
) | |||
inp, M = utils.convert_inputs(inp, M) | |||
(result,) = apply(op, inp, M, Tensor(dsize)) | |||
dsize = astensor1d(dsize, inp, dtype="int32", device=inp.device) | |||
(result,) = apply(op, inp, M, dsize) | |||
return result | |||
@@ -885,19 +938,18 @@ def matmul( | |||
With different inputs dim, this function behaves differently: | |||
- Both 1-D tensor, simply forward to dot. | |||
- Both 1-D tensor, simply forward to ``dot``. | |||
- Both 2-D tensor, normal matrix multiplication. | |||
- If one input tensor is 1-D, matrix vector multiplication. | |||
- If at least one tensor is 3-dimensional or >3-dimensional, the batched matrix-matrix product is returned, and the tensor with smaller dimension will | |||
- If at least one tensor is 3-dimensional or >3-dimensional, the other tensor must be at least 2-dimensional; the batched matrix-matrix product is returned, and the tensor with smaller dimension will | |||
be broadcasted. For example: | |||
- inp1: `(k, m)`, inp2: `(m, p)`, return: `(k, p)` | |||
- inp1: `(n, k, m)`, inp2: `(n, m, p)`, return: `(n, k, p)` | |||
- inp1: `(n, k, m)`, inp2: `(m, p)`, return: `(n, k, p)` | |||
- inp1: `(n, j, k, m)`, inp2: `(n, j, m, p)`, return: `(n, j, k, p)` | |||
:param inp1: The first matrix to be multiplied | |||
:param inp2: The second matrix to be multiplied | |||
:return: The output tensor | |||
:param inp1: first matrix to be multiplied. | |||
:param inp2: second matrix to be multiplied. | |||
:return: output tensor. | |||
Examples: | |||
@@ -931,10 +983,10 @@ def matmul( | |||
if dim1 != dim2: | |||
if dim1 < dim2: | |||
shape1 = shape2[: dim2 - dim1] + shape1 | |||
inp1 = inp1.broadcast(*shape1) | |||
inp1 = broadcast_to(inp1, shape1) | |||
else: | |||
shape2 = shape1[: dim1 - dim2] + shape2 | |||
inp2 = inp2.broadcast(*shape2) | |||
inp2 = broadcast_to(inp2, shape2) | |||
reshaped_batch_size = 1 | |||
for i in shape1[:-2]: | |||
reshaped_batch_size *= i | |||
@@ -949,9 +1001,9 @@ def matmul( | |||
shp = shape1[:-1] + shape2[-1:] | |||
elif dim1 == 3 or dim2 == 3: | |||
if dim2 < 3: | |||
inp2 = inp2.broadcast(*(inp1.shape[:1] + inp2.shape)) | |||
inp2 = broadcast_to(inp2, inp1.shape[:1] + inp2.shape) | |||
elif dim1 < 3: | |||
inp1 = inp1.broadcast(*(inp2.shape[:1] + inp1.shape)) | |||
inp1 = broadcast_to(inp1, inp2.shape[:1] + inp1.shape) | |||
op = builtin.BatchedMatrixMul( | |||
transposeA=transpose_a, | |||
transposeB=transpose_b, | |||
@@ -961,10 +1013,10 @@ def matmul( | |||
else: | |||
if dim1 == 1: | |||
shp = (inp2.shape[1],) | |||
inp1 = add_axis(inp1, 0) | |||
inp1 = expand_dims(inp1, 0) | |||
if dim2 == 1: | |||
shp = (inp1.shape[0],) | |||
inp2 = add_axis(inp2, 1) | |||
inp2 = expand_dims(inp2, 1) | |||
op = builtin.MatrixMul( | |||
transposeA=transpose_a, | |||
transposeB=transpose_b, | |||
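The shape rules listed in the docstring follow ordinary batched-matmul broadcasting: missing leading dimensions are prepended and broadcast before the batched multiply. A small NumPy illustration of the same rules (NumPy's ``matmul`` semantics are used here only as a reference, not as the MegEngine code path):

    import numpy as np

    a = np.ones((4, 3, 2))   # (n, k, m)
    b = np.ones((2, 5))      # (m, p) -> broadcast to (n, m, p)
    print((a @ b).shape)     # (4, 3, 5)

    v = np.ones(2)           # 1-d operand: matrix-vector product
    print((a @ v).shape)     # (4, 3)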
@@ -981,12 +1033,12 @@ def matmul( | |||
def dot(inp1: Tensor, inp2: Tensor) -> Tensor: | |||
""" | |||
Compute dot-product of two vectors ``inp1`` and ``inp2``. | |||
Computes dot-product of two vectors ``inp1`` and ``inp2``. | |||
Both inputs must be 1-dimensional; a scalar input can be automatically broadcasted. | |||
:param inp1: The first vector | |||
:param inp2: The second vector | |||
:return: The output value | |||
:param inp1: first vector. | |||
:param inp2: second vector. | |||
:return: output value. | |||
Examples: | |||
@@ -1016,10 +1068,10 @@ def dot(inp1: Tensor, inp2: Tensor) -> Tensor: | |||
def svd(inp: Tensor, full_matrices=False, compute_uv=True) -> Tensor: | |||
""" | |||
Compute the singular value decompositions of input matrix ``inp``. | |||
Computes the singular value decompositions of input matrix. | |||
:param inp: The input matrix, must have shape ``[..., M, N]`` | |||
:return: The output matrices, U, sigma, V | |||
:param inp: input matrix, must have shape `[..., M, N]`. | |||
:return: output matrices, `(U, sigma, V)`. | |||
Examples: | |||
@@ -1036,7 +1088,7 @@ def svd(inp: Tensor, full_matrices=False, compute_uv=True) -> Tensor: | |||
Outputs: | |||
.. testoutput:: | |||
[7.3485 1. ] | |||
""" | |||
@@ -1052,8 +1104,7 @@ def interpolate( | |||
mode: str = "BILINEAR", | |||
align_corners: bool = None, | |||
) -> Tensor: | |||
r"""Down/up samples the input tensor to either the given size or the given | |||
scale_factor. | |||
r"""Down/up samples the input tensor to either the given size or with the given scale_factor. ``size`` can not coexist with ``scale_factor``. | |||
:param inp: input tensor. | |||
:param size: size of the output tensor. Default: None | |||
@@ -1069,13 +1120,12 @@ def interpolate( | |||
import numpy as np | |||
from megengine import tensor | |||
import megengine.functional as F | |||
from megengine.test import assertTensorClose | |||
x = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) | |||
out = F.interpolate(x, [4, 4], align_corners=False) | |||
out = F.nn.interpolate(x, [4, 4], align_corners=False) | |||
print(out.numpy()) | |||
out2 = F.interpolate(x, scale_factor=2.) | |||
assertTensorClose(out.numpy(), out2.numpy()) | |||
out2 = F.nn.interpolate(x, scale_factor=2.) | |||
np.testing.assert_allclose(out.numpy(), out2.numpy()) | |||
Outputs: | |||
@@ -1100,7 +1150,7 @@ def interpolate( | |||
align_corners = False | |||
if mode == "LINEAR": | |||
inp = add_axis(inp, 3) | |||
inp = expand_dims(inp, 3) | |||
if inp.ndim != 4: | |||
raise ValueError("shape of input tensor must correspond to the operartion mode") | |||
@@ -1170,7 +1220,7 @@ def interpolate( | |||
[row0, row1, Tensor([[0, 0, 1]], dtype="float32", device=inp.device)], | |||
axis=0, | |||
).reshape(1, 3, 3) | |||
weight = broadcast(weight, (inp.shape[0], 3, 3)) | |||
weight = broadcast_to(weight, (inp.shape[0], 3, 3)) | |||
else: | |||
hscale = 1.0 * ih / oh | |||
wscale = 1.0 * iw / ow | |||
@@ -1186,7 +1236,7 @@ def interpolate( | |||
[row0, row1, Tensor([[0, 0, 1]], dtype="float32", device=inp.device)], | |||
axis=0, | |||
).reshape(1, 3, 3) | |||
weight = broadcast(weight, (inp.shape[0], 3, 3)) | |||
weight = broadcast_to(weight, (inp.shape[0], 3, 3)) | |||
weight = weight.astype("float32") | |||
ret = warp_perspective(inp, weight, dsize, interp_mode="LINEAR") | |||
@@ -1197,12 +1247,12 @@ def interpolate( | |||
def dropout(inp: Tensor, drop_prob: float, training: bool = True) -> Tensor: | |||
"""Returns a new tensor where each of the elements are randomly set to zero | |||
with probability P = ``drop_prob``. Optionally rescale the output tensor. | |||
with probability P = ``drop_prob``. Optionally rescale the output tensor if ``training`` is True. | |||
:param inp: input tensor. | |||
:param drop_prob: probability to drop (set to zero) a single element. | |||
:param training: the default behavior of ``dropout`` during training is to rescale the output, | |||
then it can be replaced by an :class:`~.Identity` during inference, default to True. | |||
so that it can be replaced by an :class:`~.Identity` during inference. Default: True | |||
:return: the output tensor | |||
Examples: | |||
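A NumPy sketch of inverted dropout, assuming the usual convention of rescaling surviving elements by ``1 / (1 - drop_prob)`` during training (``dropout_ref`` is a hypothetical reference; the exact rescaling used by MegEngine is not shown in this diff):

    import numpy as np

    def dropout_ref(x, drop_prob, training=True, seed=0):
        # zero elements with probability drop_prob and rescale the survivors
        # so the expected value of the output matches the input
        if not training or drop_prob == 0.0:
            return x
        rng = np.random.default_rng(seed)
        mask = rng.random(x.shape) >= drop_prob
        return x * mask / (1.0 - drop_prob)

    x = np.ones((2, 4), dtype=np.float32)
    print(dropout_ref(x, 0.5))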
@@ -1244,10 +1294,10 @@ def embedding( | |||
"""Applies lookup table for embedding. | |||
:param inp: tensor with indices. | |||
:param weight: learnable weights which embedding from. | |||
:param padding_idx: should be set to None, not support now. | |||
:param max_norm: should be set to None, not support now. | |||
:param norm_type: should be set to None, not support now. | |||
:param weight: learnable weights from which embeddings are looked up. | |||
:param padding_idx: should be set to None, not supported now. | |||
:param max_norm: should be set to None, not supported now. | |||
:param norm_type: should be set to None, not supported now. | |||
:return: output tensor. | |||
Refer to :class:`~.Embedding` for more information. | |||
@@ -1288,7 +1338,7 @@ def roi_pooling( | |||
np.random.seed(42) | |||
inp = tensor(np.random.randn(1, 1, 128, 128)) | |||
rois = tensor(np.random.random((4, 5))) | |||
y = F.roi_pooling(inp, rois, (2, 2)) | |||
y = F.nn.roi_pooling(inp, rois, (2, 2)) | |||
print(y.numpy()[0]) | |||
Outputs: | |||
@@ -1323,14 +1373,14 @@ def roi_align( | |||
) -> Tensor: | |||
"""Applies roi align on input feature. | |||
:param inp: tensor that represents the input feature, `(N, C, H, W)` images. | |||
:param rois: `(N, 5)` boxes. First column is the index into N. The other 4 columns are xyxy. | |||
:param inp: tensor that represents the input feature, shape is `(N, C, H, W)`. | |||
:param rois: `(N, 5)` boxes. First column is the box index. The other 4 columns are ``xyxy``. | |||
:param output_shape: `(height, width)` shape of output rois feature. | |||
:param mode: "max" or "average", use max/average align just like max/average pooling. Default: "average" | |||
:param spatial_scale: scale the input boxes by this number. Default: 1.0 | |||
:param sample_points: number of inputs samples to take for each output sample. | |||
0 to take samples densely. Default: 2 | |||
:param aligned: whether align the input feature, with `aligned=True`, | |||
:param aligned: whether to align the input feature, with `aligned=True`, | |||
we first appropriately scale the ROI and then shift it by -0.5. Default: True | |||
:return: output tensor. | |||
@@ -1345,7 +1395,7 @@ def roi_align( | |||
np.random.seed(42) | |||
inp = tensor(np.random.randn(1, 1, 128, 128)) | |||
rois = tensor(np.random.random((4, 5))) | |||
y = F.roi_align(inp, rois, (2, 2)) | |||
y = F.nn.roi_align(inp, rois, (2, 2)) | |||
print(y.numpy()[0]) | |||
Outputs: | |||
@@ -1383,7 +1433,7 @@ def roi_align( | |||
def indexing_one_hot( | |||
src: Tensor, index: Tensor, axis: int = 1, keepdims=False | |||
) -> Tensor: | |||
r"""One-hot indexing for some axis. | |||
r"""One-hot indexing for some axes. | |||
:param src: input tensor. | |||
:param index: index tensor. | |||
@@ -1417,19 +1467,23 @@ def indexing_one_hot( | |||
index = utils.convert_single_value(index, (src,), dtype="int32", device=src.device) | |||
(result,) = apply(op, src, index) | |||
if not keepdims: | |||
result = remove_axis(result, axis) | |||
result = squeeze(result, axis) | |||
return result | |||
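One-hot indexing gathers a single value per position, selected by ``index`` along ``axis``; a NumPy sketch of the equivalent gather-then-squeeze (a reference illustration, not the MegEngine operator):

    import numpy as np

    src = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
    index = np.array([0, 1])  # pick column 0 of row 0, column 1 of row 1

    # gather one value per row along axis=1, then drop the kept axis (keepdims=False)
    out = np.take_along_axis(src, index[:, None], axis=1).squeeze(1)
    print(out)  # [1. 4.]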
def nms(boxes: Tensor, scores: Tensor, iou_thresh: float) -> Tensor: | |||
def nms( | |||
boxes: Tensor, scores: Tensor, iou_thresh: float, max_output: Optional[int] = None | |||
) -> Tensor: | |||
r""" | |||
Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union(IoU). | |||
:param boxes: tensor of shape `(N, 4)`; the boxes to perform nms on; each box is expected to be in `(x1, y1, x2, y2)` format. | |||
:param iou_thresh: iou threshold for overlapping. | |||
:param iou_thresh: IoU threshold for overlapping. | |||
:param scores: tensor of shape `(N,)`, the score of boxes. | |||
:param max_output: the maximum number of boxes to keep; it is optional if this operator is not traced | |||
otherwise it is required to be specified; if it is not specified, all boxes are kept. | |||
:return: indices of the elements that have been kept by NMS. | |||
Examples: | |||
.. testcode:: | |||
@@ -1444,13 +1498,13 @@ def nms(boxes: Tensor, scores: Tensor, iou_thresh: float) -> Tensor: | |||
x[:,2:] = np.random.rand(100,2)*20 + 100 | |||
scores = tensor(np.random.rand(100)) | |||
inp = tensor(x) | |||
result = F.nms(inp, scores, iou_thresh=0.7) | |||
result = F.nn.nms(inp, scores, iou_thresh=0.7) | |||
print(result.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[75 69] | |||
""" | |||
@@ -1466,74 +1520,24 @@ def nms(boxes: Tensor, scores: Tensor, iou_thresh: float) -> Tensor: | |||
scores = scores.detach() | |||
sorted_idx = argsort(scores, descending=True) | |||
boxes = boxes[sorted_idx] | |||
max_output = boxes.shape[0] | |||
if is_tracing(): | |||
assert ( | |||
max_output is not None and max_output > 0 | |||
), "max_output should be specified under tracing" | |||
if max_output is None: | |||
max_output = boxes.shape[0] | |||
op = builtin.NMSKeep(iou_thresh, max_output) | |||
inp = utils.convert_inputs(boxes.reshape(1, -1, 4)) | |||
indices, count = apply(op, *inp) | |||
indices = indices[0][: count.item()] | |||
indices = indices[0][: count[0]] | |||
keep_inds = sorted_idx[indices] | |||
return keep_inds | |||
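Editor's note: the tracing assertion above exists because ``NMSKeep`` needs a static output size when the function is compiled. A minimal usage sketch of the intended call pattern follows; the import paths, threshold, and box data are illustrative, not taken from this patch:

    import numpy as np
    import megengine.functional as F
    from megengine import tensor
    from megengine.jit import trace

    @trace(symbolic=True)
    def traced_nms(boxes, scores):
        # under tracing, max_output must be given so the kernel's output size is known at compile time
        return F.nn.nms(boxes, scores, iou_thresh=0.5, max_output=20)

    boxes = tensor(np.random.rand(100, 4).astype("float32") * 100)
    scores = tensor(np.random.rand(100).astype("float32"))
    keep = traced_nms(boxes, scores)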
def batched_nms( | |||
boxes: Tensor, scores: Tensor, idxs: Tensor, iou_thresh: float, | |||
) -> Tensor: | |||
r""" | |||
Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union (IoU). | |||
:param boxes: tensor of shape `(N, 4)`; the boxes to perform nms on; each box is expected to be in `(x1, y1, x2, y2)` format | |||
:param iou_thresh: iou threshold for overlapping | |||
:param idxs: tensor of shape `(N,)`, the class indexs of boxes in the batch. | |||
:param scores: tensor of shape `(N,)`, the score of boxes. | |||
:return: indices and the number of the elements that have been kept by NMS | |||
Examples: | |||
.. testcode:: | |||
import numpy as np | |||
from megengine import tensor | |||
import megengine.functional as F | |||
x = np.zeros((100,4)) | |||
np.random.seed(42) | |||
x[:,:2] = np.random.rand(100,2)*20 | |||
x[:,2:] = np.random.rand(100,2)*20 + 100 | |||
scores = tensor(np.random.rand(100)) | |||
idxs = tensor(np.random.randint(0, 10, 100)) | |||
inp = tensor(x) | |||
result = F.batched_nms(inp, scores, idxs, iou_thresh=0.6) | |||
print(result.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[75 41 99 98 69 64 11 27 35 18] | |||
""" | |||
assert ( | |||
boxes.ndim == 2 and boxes.shape[1] == 4 | |||
), "the expected shape of boxes is (N, 4)" | |||
assert scores.ndim == 1, "the expected shape of scores is (N,)" | |||
assert idxs.ndim == 1, "the expected shape of idxs is (N,)" | |||
assert boxes.shape[0] == scores.shape[0] == idxs.shape[0] | |||
boxes = boxes.detach() | |||
scores = scores.detach() | |||
idxs = idxs.detach() | |||
max_coordinate = boxes.max() | |||
offsets = idxs.astype("float32") * (max_coordinate + 1) | |||
boxes = boxes + offsets.reshape(-1, 1).broadcast(boxes.shape[0], 4) | |||
sorted_idx = argsort(scores, descending=True) | |||
boxes = boxes[sorted_idx] | |||
max_output = boxes.shape[0] | |||
op = builtin.NMSKeep(iou_thresh, max_output) | |||
inp = utils.convert_inputs(boxes.reshape(1, -1, 4)) | |||
indices, count = apply(op, *inp) | |||
indices = indices[0][: count.item()] | |||
keep_inds = sorted_idx[indices] | |||
return keep_inds | |||
from .loss import * # isort:skip | |||
from .quantized import conv_bias_activation # isort:skip |
@@ -1,34 +0,0 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
import numpy as np | |||
from ..tensor import Tensor | |||
from .distributed import all_reduce_sum | |||
from .tensor import param_pack_concat, param_pack_split | |||
def get_offsets(shapes): | |||
offsets = [] | |||
offset = 0 | |||
for shape in shapes: | |||
offsets.append(offset) | |||
offset += int(np.prod(shape)) | |||
offsets.append(offset) | |||
return offsets | |||
def pack_allreduce_split(pack_list, shapes, group, reduce_method): | |||
offsets_val = get_offsets(shapes) | |||
offsets = Tensor(offsets_val) | |||
packed_grads = param_pack_concat(pack_list, offsets, offsets_val) | |||
packed_grads = all_reduce_sum(packed_grads, group) | |||
if reduce_method == "mean": | |||
packed_grads /= group.size | |||
grads = param_pack_split(packed_grads, offsets_val, shapes) | |||
return grads |
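Editor's note: for reference, a short runnable sketch of what the removed ``get_offsets`` helper computed for the shapes used in the ``param_pack_split`` docstring (illustrative only, mirrors the deleted code):

    import numpy as np

    shapes = [(1,), (3, 3)]
    offsets, acc = [], 0
    for shape in shapes:
        offsets.append(acc)          # begin of this parameter in the flattened pack
        acc += int(np.prod(shape))
        offsets.append(acc)          # end of this parameter
    assert offsets == [0, 1, 1, 10]  # the [begin0, end0, begin1, end1] layout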
@@ -34,26 +34,23 @@ def conv_bias_activation( | |||
:param weight: convolution kernel. | |||
:param bias: bias added to the result of convolution | |||
:param stride: stride of the 2D convolution operation. Default: 1 | |||
:param padding: size of the paddings added to the input on both sides of its | |||
spatial dimensions. Only zero-padding is supported. Default: 0 | |||
:param padding: size of the paddings added to the input on both sides of its spatial dimensions. Only zero-padding is supported. Default: 0 | |||
:param dilation: dilation of the 2D convolution operation. Default: 1 | |||
:param groups: number of groups to divide input and output channels into, | |||
so as to perform a "grouped convolution". When groups is not 1, | |||
in_channels and out_channels must be divisible by groups, | |||
:param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1, | |||
``in_channels`` and ``out_channels`` must be divisible by ``groups``, | |||
and the shape of weight should be `(groups, out_channel // groups, | |||
in_channels // groups, height, width)`. | |||
:type conv_mode: string or :class:`P.Convolution.Mode`. | |||
:param conv_mode: supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default: | |||
'CROSS_CORRELATION' | |||
:param dtype: support for np.dtype, Default: np.int8 | |||
:param dtype: support for ``np.dtype``, Default: np.int8 | |||
:param scale: scale used for quantization. Default: 0.0 | |||
:param zero_point: zero point used for quint8 quantization. Default: 0.0 | |||
:type compute_mode: string or | |||
:class:`P.Convolution.ComputeMode`. | |||
:param compute_mode: when set to 'DEFAULT', no special requirements will be | |||
placed on the precision of intermediate results. When set to 'FLOAT32', | |||
Float32 would be used for accumulator and intermediate result, but only | |||
effective when input and output are of Float16 dtype. | |||
:param compute_mode: when set to "DEFAULT", no special requirements will be | |||
placed on the precision of intermediate results. When set to "FLOAT32", | |||
"Float32" would be used for accumulator and intermediate result, but only effective when input and output are of Float16 dtype. | |||
""" | |||
ph, pw = _pair(padding) | |||
@@ -19,6 +19,7 @@ from ..core.ops import builtin | |||
from ..core.ops._internal import param_defs as P | |||
from ..core.ops.special import Const | |||
from ..core.tensor.core import TensorBase, TensorWrapperBase, apply | |||
from ..core.tensor.tensor_wrapper import _broadcast, _remove_axis | |||
from ..core.tensor.utils import ( | |||
astensor1d, | |||
convert_inputs, | |||
@@ -31,27 +32,22 @@ from ..tensor import Tensor | |||
from .elemwise import ceil | |||
__all__ = [ | |||
"add_axis", | |||
"arange", | |||
"broadcast", | |||
"broadcast_to", | |||
"concat", | |||
"cond_take", | |||
"transpose", | |||
"add_axis", | |||
"expand_dims", | |||
"eye", | |||
"flatten", | |||
"full", | |||
"full_like", | |||
"gather", | |||
"identity", | |||
"linspace", | |||
"ones", | |||
"ones_like", | |||
"param_pack_concat", | |||
"param_pack_split", | |||
"reshape", | |||
"remove_axis", | |||
"split", | |||
"squeeze", | |||
"stack", | |||
"scatter", | |||
"transpose", | |||
@@ -61,11 +57,10 @@ __all__ = [ | |||
] | |||
def eye(shape, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor: | |||
def eye(N, M=None, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor: | |||
"""Returns a 2D tensor with ones on the diagonal and zeros elsewhere. | |||
:param shape: expected shape of otuput tensor. | |||
:param m: number of columns. Default: None | |||
:param shape: expected shape of output tensor. | |||
:param dtype: data type. Default: None | |||
:param device: compute node of the matrix. Default: None | |||
:return: eye matrix. | |||
@@ -77,8 +72,7 @@ def eye(shape, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor: | |||
import numpy as np | |||
import megengine.functional as F | |||
data_shape = (4, 6) | |||
out = F.eye(data_shape, dtype=np.float32) | |||
out = F.eye(4, 6, dtype=np.float32) | |||
print(out.numpy()) | |||
Outputs: | |||
@@ -91,8 +85,17 @@ def eye(shape, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor: | |||
[0. 0. 0. 1. 0. 0.]] | |||
""" | |||
if M is not None: | |||
if isinstance(N, Tensor) or isinstance(M, Tensor): | |||
shape = astensor1d((N, M)) | |||
else: | |||
shape = Tensor([N, M], dtype="int32", device=device) | |||
elif isinstance(N, Tensor): | |||
shape = N | |||
else: | |||
shape = Tensor(N, dtype="int32", device=device) | |||
op = builtin.Eye(k=0, dtype=dtype, comp_node=device) | |||
(result,) = apply(op, Tensor(shape, dtype="int32", device=device)) | |||
(result,) = apply(op, shape) | |||
return result | |||
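Editor's note: since the new signature also accepts a tensor for ``N``, the output shape can come from another tensor at runtime. A minimal sketch, assuming ``eye`` is re-exported as ``F.eye`` as in the docstring example:

    import megengine.functional as F
    from megengine import tensor

    print(F.eye(4, 6).shape)      # (4, 6), as in the docstring example
    shp = tensor([3, 3])
    print(F.eye(shp).shape)       # (3, 3): N given as a 1-D shape tensor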
@@ -106,7 +109,7 @@ def full(shape, value, dtype="float32", device=None): | |||
(x,) = Const(value, dtype=dtype, device=device)( | |||
Tensor(value, dtype=dtype, device=device) | |||
) | |||
return broadcast(x, shape) | |||
return broadcast_to(x, shape) | |||
def ones(shape, dtype="float32", device=None): | |||
@@ -160,7 +163,7 @@ def zeros_like(inp: Tensor) -> Tensor: | |||
print(out.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[[0 0 0] | |||
@@ -171,7 +174,7 @@ def zeros_like(inp: Tensor) -> Tensor: | |||
def ones_like(inp: Tensor) -> Tensor: | |||
"""Returns a identity tensor with the same shape as input tensor. | |||
"""Returns a ones tensor with the same shape as input tensor. | |||
""" | |||
return ones(inp.shape, dtype=inp.dtype, device=inp.device) | |||
@@ -182,19 +185,7 @@ def full_like(inp: Tensor, value: Union[int, float]) -> Tensor: | |||
return full(inp.shape, value, dtype=inp.dtype, device=inp.device) | |||
def identity(inp: Tensor) -> Tensor: | |||
"""Applies an identity transform to the input tensor. | |||
:param inp: input tensor. | |||
:return: output tensor. | |||
""" | |||
op = builtin.Identity() | |||
(data,) = convert_inputs(inp) | |||
(output,) = apply(op, data) | |||
return output | |||
def broadcast(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: | |||
def broadcast_to(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: | |||
""" | |||
Broadcasts a tensor to given shape. | |||
@@ -211,7 +202,7 @@ def broadcast(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: | |||
import megengine.functional as F | |||
data = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) | |||
out = F.broadcast(data, (4, 2, 3)) | |||
out = F.broadcast_to(data, (4, 2, 3)) | |||
print(out.numpy()) | |||
Outputs: | |||
@@ -231,9 +222,7 @@ def broadcast(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: | |||
[3. 4. 5.]]] | |||
""" | |||
shape = astensor1d(shape, inp, dtype="int32", device=inp.device) | |||
(result,) = apply(builtin.Broadcast(), inp, shape) | |||
return result | |||
return _broadcast(inp, shape) | |||
def concat(inps: Iterable[Tensor], axis: int = 0, device=None) -> Tensor: | |||
@@ -241,8 +230,8 @@ def concat(inps: Iterable[Tensor], axis: int = 0, device=None) -> Tensor: | |||
Concat some tensors | |||
:param inps: input tensors to concat. | |||
:param axis: dimension over which the tensors are concatenated. Default: 0 | |||
:param device: comp node output on. Default: None | |||
:param axis: over which dimension the tensors are concatenated. Default: 0 | |||
:param device: the device the output will be on. Default: None | |||
:return: output tensor. | |||
Examples: | |||
@@ -290,7 +279,7 @@ def stack(inps, axis=0, device=None): | |||
:param inps: input tensors. | |||
:param axis: which axis will be concatenated. | |||
:param device: The comp node output on. Default: None | |||
:param device: the device the output will be on. Default: None | |||
:return: output concatenated tensor. | |||
Examples: | |||
@@ -322,7 +311,7 @@ def stack(inps, axis=0, device=None): | |||
if len(shapes) != 1: | |||
raise ValueError("All input tensors must have the same shape") | |||
inps = [add_axis(inp, axis=axis) for inp in inps] | |||
inps = [expand_dims(inp, axis=axis) for inp in inps] | |||
return concat(inps, axis=axis, device=device) | |||
@@ -331,7 +320,7 @@ def split(inp, nsplits_or_sections, axis=0): | |||
When nsplits_or_sections is int, the last tensor may be smaller than others. | |||
:param inp: input tensor. | |||
:param nsplits_or_sections: number of sub tensors or section information list. | |||
:param nsplits_or_sections: number of sub tensors, or a list of section sizes. | |||
:param axis: which axis will be split. | |||
:return: output tensor list. | |||
@@ -399,8 +388,7 @@ def _get_idx(index, axis): | |||
0, index.shape[i] - 1, index.shape[i], device=index.device, | |||
) | |||
arange = ( | |||
arange.reshape(*shape) | |||
.broadcast(index.shape) | |||
broadcast_to(arange.reshape(*shape), index.shape) | |||
.reshape(-1) | |||
.astype(np.int32) | |||
) | |||
@@ -411,7 +399,8 @@ def _get_idx(index, axis): | |||
def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor: | |||
r"""Gathers data from inp on axis using index. | |||
# TODO: rewrite doc | |||
r"""Gathers data from input tensor on axis using index. | |||
For a 3-D tensor, the output is specified by:: | |||
@@ -419,14 +408,14 @@ def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor: | |||
out[i][j][k] = inp[i][index[i][j][k]][k] # if axis == 1 | |||
out[i][j][k] = inp[i][j][index[i][j][k]] # if axis == 2 | |||
if inp is an n-dimensional tensor with size | |||
if the input tensor is an n-dimensional tensor with size | |||
:math:`(x_0,x_1,...,x_{i-1},x_i,x_{i+1},...,x_{n-1})` and axis=i, | |||
then index must be an n-dimensional tensor with size | |||
then index must be an n-dimensional tensor with size | |||
:math:`(x_0,x_1,...,x_{i-1},y,x_{i+1},...,x_{n-1})` where :math:`y\ge 1` and | |||
output will have the same size as index. | |||
:param inp: input tensor. | |||
:param axis: axis along which to index. | |||
:param axis: along which axis to index. | |||
:param index: indices of elements to gather. | |||
:return: output tensor. | |||
@@ -482,20 +471,21 @@ def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor: | |||
def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: | |||
r"""Writes all values from the tensor source into inp | |||
# TODO: rewrite doc | |||
r"""Writes all values from the tensor source into input tensor | |||
at the indices specified in the index tensor. | |||
For each value in source, its output index is specified by its index | |||
in source for ``axis != dimension`` and by the corresponding value in | |||
index for ``axis = dimension``. | |||
For a 3-D tensor, inp is updated as:: | |||
For a 3-D tensor, input tensor is updated as:: | |||
inp[index[i][j][k]][j][k] = source[i][j][k] # if axis == 0 | |||
inp[i][index[i][j][k]][k] = source[i][j][k] # if axis == 1 | |||
inp[i][j][index[i][j][k]] = source[i][j][k] # if axis == 2 | |||
inp, index and source should have same number of dimensions. | |||
``inp``, ``index`` and ``source`` should have same number of dimensions. | |||
It is also required that ``source.shape(d) <= inp.shape(d)`` and ``index.shape(d) == source.shape(d)`` | |||
for all dimensions ``d``. | |||
@@ -504,10 +494,10 @@ def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: | |||
.. note:: | |||
Please notice that, due to performance issues, the result is uncertain on the GPU device | |||
if scatter difference positions from source to the same destination position | |||
if scattering different positions from source to the same destination position | |||
with regard to the index tensor. | |||
Show the case using the following examples, the oup[0][2] is maybe | |||
Check the following examples: ``oup[0][2]`` may come | |||
from ``source[0][2]`` (whose value is 0.2256) or from ``source[1][2]`` (whose value is 0.5339) | |||
if ``index[1][2]`` is changed from 1 to 0. | |||
@@ -593,7 +583,7 @@ def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor: | |||
\textrm{out}_i = x_i \textrm{ if } \textrm{mask}_i \textrm{ is True else } y_i | |||
:param mask: a mask used for choosing x or y. | |||
:param mask: a mask used for choosing ``x`` or ``y``. | |||
:param x: first choice. | |||
:param y: second choice. | |||
:return: output tensor. | |||
@@ -649,7 +639,7 @@ def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor: | |||
def cond_take(mask: Tensor, x: Tensor) -> Tensor: | |||
r""" | |||
Take elements from data if specific condition is satisfied on mask. | |||
Takes elements from data if specific condition is satisfied on mask. | |||
This operator has two outputs: the first is the elements taken, | |||
and the second is the indices corresponding to those elements; | |||
they are both 1-dimensional. High-dimension input would first be flattened. | |||
@@ -696,7 +686,7 @@ def transpose(inp: Tensor, pattern: Iterable[int]) -> Tensor: | |||
Swaps shapes and strides according to given pattern. | |||
:param inp: input tensor. | |||
:param pattern: a list of integers including 0, 1, ... , ``ndim``-1, | |||
:param pattern: a list of integers including 0, 1, ... , ``ndim``-1, | |||
and any number of ``'x'`` char in dimensions where this tensor should be broadcasted. For examples: | |||
* (``'x'``) -> make a 0d (scalar) into a 1d vector | |||
@@ -707,7 +697,7 @@ def transpose(inp: Tensor, pattern: Iterable[int]) -> Tensor: | |||
* (2, 0, 1) -> AxBxC to CxAxB | |||
* (0, ``'x'``, 1) -> AxB to Ax1xB | |||
* (1, ``'x'``, 0) -> AxB to Bx1xA | |||
* (1,) -> This remove dimensions 0. It must be a broadcastable dimension (1xA to A) | |||
* (1,) -> this removes dimensions 0. It must be a broadcastable dimension (1xA to A) | |||
:return: output tensor. | |||
@@ -730,13 +720,7 @@ def transpose(inp: Tensor, pattern: Iterable[int]) -> Tensor: | |||
[1 0]] | |||
""" | |||
op = builtin.Dimshuffle(pattern) | |||
(inp,) = convert_inputs(inp) | |||
(result,) = apply(op, inp) | |||
return result | |||
dimshuffle = transpose | |||
return inp.transpose(pattern) | |||
def reshape(inp: Tensor, target_shape: Iterable[int]) -> Tensor: | |||
@@ -745,8 +729,7 @@ def reshape(inp: Tensor, target_shape: Iterable[int]) -> Tensor: | |||
remain unchanged | |||
:param inp: input tensor. | |||
:param target_shape: target shape, the components would be concatenated to form the | |||
target shape, and it can contain an element of -1 representing unspec_axis. | |||
:param target_shape: target shape; it can contain an element of -1 representing ``unspec_axis``. | |||
Examples: | |||
@@ -773,26 +756,7 @@ def reshape(inp: Tensor, target_shape: Iterable[int]) -> Tensor: | |||
[10 11]]] | |||
""" | |||
if isinstance(target_shape, (TensorBase, TensorWrapperBase)): | |||
target_shape = target_shape.numpy() | |||
target_shape = tuple(map(int, target_shape)) | |||
unspec_axis = None | |||
for i, s in enumerate(target_shape): | |||
if s < 0: | |||
if s != -1: | |||
raise ValueError("expect shape[{}] >= -1, got {}".format(i, s)) | |||
if unspec_axis is not None: | |||
raise ValueError("multiple -1 in shape: {} & {}".format(unspec_axis, i)) | |||
unspec_axis = i | |||
# TODO: device should be None (cpu) | |||
(target_shape,) = Const(target_shape, dtype="int32", device=inp.device)(inp) | |||
if unspec_axis is None: | |||
op = builtin.Reshape() | |||
else: | |||
op = builtin.Reshape(unspec_axis=unspec_axis) | |||
(x,) = apply(op, inp, target_shape) | |||
return x | |||
return inp.reshape(target_shape) | |||
AxisAddRemove = builtin.AxisAddRemove | |||
@@ -837,7 +801,7 @@ def flatten(inp: Tensor, start_axis: int = 0, end_axis: int = -1) -> Tensor: | |||
return inp.reshape(*target_shape) | |||
def add_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: | |||
def expand_dims(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: | |||
r""" | |||
Adds a dimension before the given axis. | |||
@@ -854,7 +818,7 @@ def add_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: | |||
import megengine.functional as F | |||
x = tensor([1, 2]) | |||
out = F.add_axis(x, 0) | |||
out = F.expand_dims(x, 0) | |||
print(out.shape) | |||
Outputs: | |||
@@ -883,12 +847,7 @@ def add_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: | |||
return result | |||
add_axis = add_axis | |||
def remove_axis( | |||
inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None | |||
) -> Tensor: | |||
def squeeze(inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None) -> Tensor: | |||
r""" | |||
Removes dimensions with shape 1. | |||
@@ -905,7 +864,7 @@ def remove_axis( | |||
import megengine.functional as F | |||
x = tensor(np.array([1, 2], dtype=np.int32).reshape(1, 1, 2, 1)) | |||
out = F.remove_axis(x, 3) | |||
out = F.squeeze(x, 3) | |||
print(out.shape) | |||
Outputs: | |||
@@ -915,25 +874,7 @@ def remove_axis( | |||
(1, 1, 2) | |||
""" | |||
Param = builtin.AxisAddRemove.Param | |||
def get_axes(): | |||
if axis is None: | |||
return [i for i, s in enumerate(inp.shape) if s == 1] | |||
try: | |||
return [int(axis)] | |||
except (TypeError, ValueError): | |||
pass | |||
return list(map(int, axis)) | |||
axis = get_axes() | |||
axis = sorted(i + inp.ndim if i < 0 else i for i in axis) | |||
axis = [a - i for i, a in enumerate(axis)] | |||
param = Param(*map(builtin.AxisAddRemove.AxisDesc.make_remove, axis)) | |||
op = builtin.AxisAddRemove(param=param) | |||
(result,) = apply(op, inp) | |||
return result | |||
return _remove_axis(inp, axis) | |||
def linspace( | |||
@@ -962,7 +903,7 @@ def linspace( | |||
print(a.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[ 3. 4.75 6.5 8.25 10. ] | |||
@@ -982,15 +923,15 @@ def linspace( | |||
def arange( | |||
start: Union[int, float, Tensor] = 0, | |||
end: Optional[Union[int, float, Tensor]] = None, | |||
stop: Optional[Union[int, float, Tensor]] = None, | |||
step: Union[int, float, Tensor] = 1, | |||
dtype="float32", | |||
device: Optional[CompNode] = None, | |||
) -> Tensor: | |||
r"""Returns a Tensor with values from start to end with adjacent interval step. | |||
r"""Returns a tensor with values from start to stop with adjacent interval step. | |||
:param start: starting value of the sequence, should be a scalar. | |||
:param end: ending value of the sequence, should be a scalar. | |||
:param stop: ending value of the sequence, should be a scalar. | |||
:param step: gap between each pair of adjacent values. Default: 1 | |||
:param dtype: result data type. | |||
:return: generated tensor. | |||
@@ -1004,7 +945,7 @@ def arange( | |||
a = F.arange(5) | |||
print(a.numpy()) | |||
Outputs: | |||
Outputs: | |||
@@ -1014,96 +955,18 @@ def arange( | |||
[0. 1. 2. 3. 4.] | |||
""" | |||
if end is None: | |||
start, end = 0, start | |||
if stop is None: | |||
start, stop = 0, start | |||
if isinstance(start, Tensor): | |||
start = start.astype("float32") | |||
if isinstance(end, Tensor): | |||
end = end.astype("float32") | |||
if isinstance(stop, Tensor): | |||
stop = stop.astype("float32") | |||
if isinstance(step, Tensor): | |||
step = step.astype("float32") | |||
num = ceil(Tensor((end - start) / step, device=device)) | |||
num = ceil(Tensor((stop - start) / step, device=device)) | |||
stop = start + step * (num - 1) | |||
result = linspace(start, stop, num, device=device) | |||
if np.dtype(dtype) == np.int32: | |||
return result.astype(dtype) | |||
return result | |||
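Editor's note: to make the ``num``/``stop`` arithmetic above concrete, a small worked example (output formatting is approximate):

    import megengine.functional as F

    # arange(1, 10, 2): num = ceil((10 - 1) / 2) = 5,
    # stop is then clipped to 1 + 2 * (5 - 1) = 9,
    # and linspace(1, 9, 5) produces [1. 3. 5. 7. 9.]
    print(F.arange(1, 10, 2).numpy())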
def param_pack_split(inp: Tensor, offsets: List, shapes: List) -> Tensor: | |||
r""" | |||
Returns split Tensor to Tensor list as offsets and shapes described, | |||
only used for parampack. | |||
:param inp: input tensor. | |||
:param offsets: offsets of outputs, length of 2 * n, | |||
while n is tensor nums you want to split, | |||
format `[begin0, end0, begin1, end1]`. | |||
:param shapes: tensor shapes of outputs. | |||
:return: split tensors. | |||
Examples: | |||
.. testcode:: | |||
import numpy as np | |||
import megengine.functional as F | |||
from megengine import tensor | |||
a = tensor(np.ones((10,), np.int32)) | |||
b, c = F.param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)]) | |||
print(b.numpy()) | |||
print(c.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[1] | |||
[[1 1 1] | |||
[1 1 1] | |||
[1 1 1]] | |||
""" | |||
op = builtin.ParamPackSplit() | |||
op.offsets = offsets | |||
op.shapes = shapes | |||
return apply(op, inp) | |||
def param_pack_concat(inps: List, offsets: Tensor, offsets_val: List) -> Tensor: | |||
r""" | |||
Returns concat Tensor, only used for parampack. | |||
:param inps: input tensors. | |||
:param offsets: device value of offsets. | |||
:param offsets_val: offsets of inputs, length of 2 * n, | |||
format [begin0, end0, begin1, end1]. | |||
:return: concat tensors | |||
Examples: | |||
.. testcode:: | |||
import numpy as np | |||
import megengine.functional as F | |||
from megengine import tensor | |||
a = tensor(np.ones((1,), np.int32)) | |||
b = tensor(np.ones((3, 3), np.int32)) | |||
offsets_val = [0, 1, 1, 10] | |||
offsets = tensor(offsets_val, np.int32) | |||
c = F.param_pack_concat([a, b], offsets, offsets_val) | |||
print(c.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[1 1 1 1 1 1 1 1 1 1] | |||
""" | |||
op = builtin.ParamPackConcat() | |||
op.offsets = offsets_val | |||
return apply(op, *inps, offsets)[0] |
@@ -11,18 +11,24 @@ from typing import Iterable, Union | |||
import numpy as np | |||
from ..core.ops.builtin import Copy | |||
from ..core._wrap import device as as_device | |||
from ..core.ops.builtin import Copy, Identity | |||
from ..core.tensor import Tensor | |||
from ..core.tensor.core import apply | |||
from .math import topk as _topk | |||
from .tensor import transpose as _transpose | |||
from .tensor import broadcast_to, transpose | |||
__all__ = [ | |||
"topk_accuracy", | |||
"copy", | |||
] | |||
def accuracy( | |||
def topk_accuracy( | |||
logits: Tensor, target: Tensor, topk: Union[int, Iterable[int]] = 1 | |||
) -> Union[Tensor, Iterable[Tensor]]: | |||
r""" | |||
Calculate the classification accuracy given predicted logits and ground-truth labels. | |||
Calculates the classification accuracy given predicted logits and ground-truth labels. | |||
:param logits: model predictions of shape `[batch_size, num_classes]`, | |||
representing the probability (likelihood) of each class. | |||
@@ -40,7 +46,7 @@ def accuracy( | |||
logits = tensor(np.arange(80, dtype=np.int32).reshape(8,10)) | |||
target = tensor(np.arange(8, dtype=np.int32)) | |||
top1, top5 = F.accuracy(logits, target, (1, 5)) | |||
top1, top5 = F.topk_accuracy(logits, target, (1, 5)) | |||
print(top1.numpy(), top5.numpy()) | |||
Outputs: | |||
@@ -54,8 +60,8 @@ def accuracy( | |||
_, pred = _topk(logits, k=max(topk), descending=True) | |||
accs = [] | |||
for k in topk: | |||
correct = pred[:, :k].detach() == _transpose(target, (0, "x")).broadcast( | |||
target.shape[0], k | |||
correct = pred[:, :k].detach() == broadcast_to( | |||
transpose(target, (0, "x")), (target.shape[0], k) | |||
) | |||
accs.append(correct.astype(np.float32).sum() / target.shape[0]) | |||
if len(topk) == 1: # type: ignore[arg-type] | |||
@@ -63,25 +69,12 @@ def accuracy( | |||
return accs | |||
def zero_grad(inp: Tensor) -> Tensor: | |||
r""" | |||
Returns a tensor which is treated as constant during backward gradient calcuation, | |||
i.e. its gradient is zero. | |||
:param inp: Input tensor. | |||
See implementation of :func:`~.softmax` for example. | |||
""" | |||
print("zero_grad is obsoleted, please use detach instead") | |||
raise NotImplementedError | |||
def copy(inp, cn): | |||
def copy(inp, device=None): | |||
r""" | |||
Copy tensor to another device. | |||
Copies tensor to another device. | |||
:param inp: input tensor. | |||
:param cn: device that you copy to. | |||
:param device: destination device. | |||
Examples: | |||
@@ -101,4 +94,6 @@ def copy(inp, cn): | |||
[1 2 3] | |||
""" | |||
return apply(Copy(comp_node=cn), inp)[0] | |||
if device is None: | |||
return apply(Identity(), inp)[0] | |||
return apply(Copy(comp_node=as_device(device).to_c()), inp)[0] |
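Editor's note: with the new default, omitting ``device`` simply runs ``Identity`` on the tensor's current device. A usage sketch, assuming ``copy`` is re-exported at the functional top level; the device string is illustrative:

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    x = tensor(np.arange(3, dtype=np.int32))
    y = F.copy(x)           # no device: Identity, result stays where x lives
    z = F.copy(x, "cpu0")   # explicit destination device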
@@ -19,12 +19,12 @@ class InvalidGitHost(FetcherError): | |||
class GitPullError(FetcherError): | |||
"""A git pull error occurred""" | |||
"""A git pull error occurred.""" | |||
class GitCheckoutError(FetcherError): | |||
"""A git checkout error occurred""" | |||
"""A git checkout error occurred.""" | |||
class InvalidProtocol(FetcherError): | |||
"""The protocol provided was somehow invalid""" | |||
"""The protocol provided was somehow invalid.""" |
@@ -106,20 +106,20 @@ class GitSSHFetcher(RepoFetcherBase): | |||
:param git_host: | |||
host address of git repo. | |||
example: github.com | |||
Example: github.com | |||
:param repo_info: | |||
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional | |||
tag/branch. The default branch is ``master`` if not specified. | |||
example: ``"brain_sdk/MegBrain[:hub]"`` | |||
Example: ``"brain_sdk/MegBrain[:hub]"`` | |||
:param use_cache: | |||
whether to use locally fetched code or completely re-fetch | |||
whether to use locally fetched code or completely re-fetch. | |||
:param commit: | |||
commit id on github or gitlab | |||
commit id on github or gitlab. | |||
:param silent: | |||
whether to accept the stdout and stderr of the subprocess with PIPE, instead of | |||
displaying on the screen | |||
displaying on the screen. | |||
:return: | |||
directory where the repo code is stored | |||
directory where the repo code is stored. | |||
""" | |||
if not cls._check_git_host(git_host): | |||
raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host)) | |||
@@ -215,24 +215,24 @@ class GitHTTPSFetcher(RepoFetcherBase): | |||
silent: bool = True, | |||
) -> str: | |||
""" | |||
Fetches git repo by HTTPS protocol | |||
Fetches git repo by HTTPS protocol. | |||
:param git_host: | |||
host address of git repo | |||
example: github.com | |||
host address of git repo. | |||
Example: github.com | |||
:param repo_info: | |||
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional | |||
tag/branch. The default branch is ``master`` if not specified. | |||
example: ``"brain_sdk/MegBrain[:hub]"`` | |||
Example: ``"brain_sdk/MegBrain[:hub]"`` | |||
:param use_cache: | |||
whether to use locally cached code or completely re-fetch | |||
whether to use locally cached code or completely re-fetch. | |||
:param commit: | |||
commit id on github or gitlab | |||
commit id on github or gitlab. | |||
:param silent: | |||
whether to accept the stdout and stderr of the subprocess with PIPE, instead of | |||
displaying on the screen | |||
displaying on the screen. | |||
:return: | |||
directory where the repo code is stored | |||
directory where the repo code is stored. | |||
""" | |||
if not cls._check_git_host(git_host): | |||
raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host)) | |||
@@ -94,24 +94,24 @@ def _init_hub( | |||
commit: str = None, | |||
protocol: str = DEFAULT_PROTOCOL, | |||
): | |||
"""Imports hubmodule like python import | |||
"""Imports hubmodule like python import. | |||
:param repo_info: | |||
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional | |||
tag/branch. The default branch is ``master`` if not specified. | |||
Example: ``"brain_sdk/MegBrain[:hub]"`` | |||
:param git_host: | |||
host address of git repo | |||
host address of git repo. | |||
Example: github.com | |||
:param use_cache: | |||
whether to use locally cached code or completely re-fetch | |||
whether to use locally cached code or completely re-fetch. | |||
:param commit: | |||
commit id on github or gitlab | |||
commit id on github or gitlab. | |||
:param protocol: | |||
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. | |||
The value should be one of HTTPS, SSH. | |||
:return: | |||
hubconf.py as a python module | |||
the repo's ``hubconf.py`` loaded as a python module. | |||
""" | |||
cache_dir = os.path.expanduser(os.path.join(_get_megengine_home(), "hub")) | |||
os.makedirs(cache_dir, exist_ok=True) | |||
@@ -137,24 +137,24 @@ def list( | |||
commit: str = None, | |||
protocol: str = DEFAULT_PROTOCOL, | |||
) -> List[str]: | |||
"""Lists all entrypoints available in repo hubconf | |||
"""Lists all entrypoints available in repo hubconf. | |||
:param repo_info: | |||
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional | |||
tag/branch. The default branch is ``master`` if not specified. | |||
Example: ``"brain_sdk/MegBrain[:hub]"`` | |||
:param git_host: | |||
host address of git repo | |||
host address of git repo. | |||
Example: github.com | |||
:param use_cache: | |||
whether to use locally cached code or completely re-fetch | |||
whether to use locally cached code or completely re-fetch. | |||
:param commit: | |||
commit id on github or gitlab | |||
commit id on github or gitlab. | |||
:param protocol: | |||
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. | |||
The value should be one of HTTPS, SSH. | |||
:return: | |||
all entrypoint names of the model | |||
all entrypoint names of the model. | |||
""" | |||
hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) | |||
@@ -182,14 +182,14 @@ def load( | |||
tag/branch. The default branch is ``master`` if not specified. | |||
Example: ``"brain_sdk/MegBrain[:hub]"`` | |||
:param entry: | |||
an entrypoint defined in hubconf | |||
an entrypoint defined in hubconf. | |||
:param git_host: | |||
host address of git repo | |||
host address of git repo. | |||
Example: github.com | |||
:param use_cache: | |||
whether to use locally cached code or completely re-fetch | |||
whether to use locally cached code or completely re-fetch. | |||
:param commit: | |||
commit id on github or gitlab | |||
commit id on github or gitlab. | |||
:param protocol: | |||
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. | |||
The value should be one of HTTPS, SSH. | |||
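Editor's note: for context, these hub entry points are typically used as below; the repository, branch, and entrypoint names are placeholders, not taken from this patch:

    import megengine.hub as hub

    entries = hub.list("some_owner/some_repo:master", protocol="HTTPS")
    model = hub.load("some_owner/some_repo:master", "resnet18", use_cache=True)
    print(hub.help("some_owner/some_repo:master", "resnet18"))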
@@ -217,9 +217,9 @@ def help( | |||
) -> str: | |||
"""This function returns docstring of entrypoint ``entry`` by following steps: | |||
1. Pull the repo code specified by git and repo_info | |||
1. Pull the repo code specified by git and repo_info. | |||
2. Load the entry defined in repo's hubconf.py | |||
3. Return docstring of function entry | |||
3. Return docstring of function entry. | |||
:param repo_info: | |||
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional | |||
@@ -228,17 +228,17 @@ def help( | |||
:param entry: | |||
an entrypoint defined in hubconf.py | |||
:param git_host: | |||
host address of git repo | |||
host address of git repo. | |||
Example: github.com | |||
:param use_cache: | |||
whether to use locally cached code or completely re-fetch | |||
whether to use locally cached code or completely re-fetch. | |||
:param commit: | |||
commit id on github or gitlab | |||
commit id on github or gitlab. | |||
:param protocol: | |||
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. | |||
The value should be one of HTTPS, SSH. | |||
:return: | |||
docstring of entrypoint ``entry`` | |||
docstring of entrypoint ``entry``. | |||
""" | |||
hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) | |||
@@ -255,10 +255,10 @@ def load_serialized_obj_from_url(url: str, model_dir=None) -> Any: | |||
If the object is already present in ``model_dir``, it's deserialized and | |||
returned. If no ``model_dir`` is specified, it will be ``MGE_HOME/serialized``. | |||
:param url: url to serialized object | |||
:param model_dir: dir to cache target serialized file | |||
:param url: url to serialized object. | |||
:param model_dir: dir to cache target serialized file. | |||
:return: loaded object | |||
:return: loaded object. | |||
""" | |||
if model_dir is None: | |||
model_dir = os.path.join(_get_megengine_home(), "serialized") | |||
@@ -15,10 +15,10 @@ from typing import Iterator | |||
def load_module(name: str, path: str) -> types.ModuleType: | |||
""" | |||
Loads module specified by name and path | |||
Loads module specified by name and path. | |||
:param name: module name | |||
:param path: module path | |||
:param name: module name. | |||
:param path: module path. | |||
""" | |||
spec = importlib.util.spec_from_file_location(name, path) | |||
module = importlib.util.module_from_spec(spec) | |||
@@ -27,18 +27,18 @@ def load_module(name: str, path: str) -> types.ModuleType: | |||
def check_module_exists(module: str) -> bool: | |||
"""Checks whether python module exists or not | |||
"""Checks whether python module exists or not. | |||
:param module: name of module | |||
:param module: name of module. | |||
""" | |||
return importlib.util.find_spec(module) is not None | |||
@contextmanager | |||
def cd(target: str) -> Iterator[None]: | |||
"""Changes current directory to target | |||
"""Changes current directory to target. | |||
:param target: target directory | |||
:param target: target directory. | |||
""" | |||
prev = os.getcwd() | |||
os.chdir(os.path.expanduser(target)) | |||
@@ -36,6 +36,13 @@ active_trace = None | |||
skip_tracing = False | |||
def is_tracing(): | |||
if active_trace is None: | |||
return False | |||
else: | |||
return not skip_tracing | |||
@contextlib.contextmanager | |||
def exclude_from_trace(): | |||
global skip_tracing | |||
@@ -125,6 +132,9 @@ class trace: | |||
self._graph_opt_level = opt_level | |||
self._tensor_shape = tensor_shape | |||
self._reset() | |||
def _reset(self): | |||
self._untraced = True | |||
self._tinfo = [] # handle -> TensorInfo | |||
self._seq = [] | |||
@@ -257,77 +267,117 @@ class trace: | |||
def _record_const(self, op, outputs): | |||
pass | |||
@contextlib.contextmanager | |||
def _setup(self): | |||
def _set_active(self, active: bool): | |||
global active_trace | |||
if active_trace: | |||
raise NotImplementedError("sorry, not implemented: nested trace") | |||
active_trace = self | |||
if self._untraced: | |||
apply.enable(apply_with_tracing) | |||
apply.enable(apply_const_with_tracing) | |||
if self._symbolic: | |||
apply.enable(apply_symbolic_mode) | |||
apply.enable(apply_const_symbolic_mode) | |||
self._lazy_eval_graph = G.Graph() | |||
if active: | |||
if active_trace: | |||
raise NotImplementedError("sorry, not implemented: nested trace") | |||
active_trace = self | |||
else: | |||
apply.enable(apply_compiled_mode) | |||
if self._graph is None: | |||
self._compile() | |||
self._graph.execute() | |||
yield | |||
assert active_trace is self | |||
active_trace = None | |||
def _init_trace(self, symbolic: bool): | |||
apply.enable(apply_with_tracing) | |||
apply.enable(apply_const_with_tracing) | |||
if symbolic: | |||
apply.enable(apply_symbolic_mode) | |||
apply.enable(apply_const_symbolic_mode) | |||
self._lazy_eval_graph = G.Graph() | |||
def _take_escaped_tensors(self): | |||
escaped_tensors = tuple(self._active_tensors) | |||
self._active_tensors.clear() | |||
return escaped_tensors | |||
if self._untraced: | |||
for x in escaped_tensors: | |||
info = self._tinfo[x._TraceMixin__handle] | |||
info.data_read = True | |||
x._TraceMixin__restore() | |||
if self._inputs_to_restore: | |||
for x in self._inputs_to_restore: | |||
def _lazy_eval(self, lazy_eval_graph, lazy_eval_tensors): | |||
active_lazy_eval_tensors = [] | |||
visited = set() | |||
readers = [] | |||
for x in lazy_eval_tensors: | |||
x = x() | |||
if x is None or x in visited: | |||
continue | |||
reader = G.OutputNode(x._LazyEvalTensor__varnode).outputs[0] | |||
readers.append(reader) | |||
active_lazy_eval_tensors.append(x) | |||
visited.add(x) | |||
self._apply_graph_options(lazy_eval_graph) | |||
lazy_eval_graph.compile(*readers) | |||
lazy_eval_graph() | |||
for r, x in zip(readers, active_lazy_eval_tensors): | |||
assign_raw_tensor(x, as_raw_tensor(r.op.get_value())) | |||
@contextlib.contextmanager | |||
def _setup(self): | |||
interrupted = False | |||
def do_enter(): | |||
self._set_active(True) | |||
if self._untraced: | |||
self._init_trace(self._symbolic) | |||
else: | |||
apply.enable(apply_compiled_mode) | |||
if self._graph is None: | |||
self._compile() | |||
self._graph.execute() | |||
def do_finalize(): | |||
escaped_tensors = self._take_escaped_tensors() | |||
if self._untraced: | |||
for x in escaped_tensors: | |||
info = self._tinfo[x._TraceMixin__handle] | |||
info.data_read = True | |||
x._TraceMixin__restore() | |||
if self._symbolic: | |||
# eval lazy eval tensors | |||
if self._lazy_eval_tensors: | |||
lazy_eval_tensors = [] | |||
visited = set() | |||
readers = [] | |||
for x in self._lazy_eval_tensors: | |||
x = x() | |||
if x is None or x in visited: | |||
continue | |||
reader = G.OutputNode(x._LazyEvalTensor__varnode).outputs[0] | |||
readers.append(reader) | |||
lazy_eval_tensors.append(x) | |||
visited.add(x) | |||
self._apply_graph_options(self._lazy_eval_graph) | |||
self._lazy_eval_graph.compile(*readers) | |||
self._lazy_eval_graph() | |||
for r, x in zip(readers, lazy_eval_tensors): | |||
assign_raw_tensor(x, as_raw_tensor(r.op.get_value())) | |||
if self._inputs_to_restore: | |||
for x in self._inputs_to_restore: | |||
x._TraceMixin__restore() | |||
if self._symbolic and self._lazy_eval_tensors: | |||
# eval lazy eval tensors | |||
self._lazy_eval(self._lazy_eval_graph, self._lazy_eval_tensors) | |||
self._lazy_eval_graph = None | |||
self._lazy_eval_tensors = None | |||
self._untraced = False | |||
else: | |||
if self._pc != len(self._seq): | |||
raise TraceMismatchError("premature end") | |||
for x in escaped_tensors: | |||
assign_raw_tensor(x, as_raw_tensor(x._dev_tensor())) | |||
self._graph.wait() | |||
self._reset_exec_env() | |||
self._untraced = False | |||
else: | |||
# compiled_tensor leaks | |||
if self._pc == len(self._seq): | |||
for x in escaped_tensors: | |||
try: | |||
assign_raw_tensor(x, as_raw_tensor(x._dev_tensor())) | |||
except TraceMismatchError: | |||
# TraceMismatchError thrown in do_exit | |||
pass | |||
self._graph.wait() | |||
self._reset_exec_env() | |||
# reset status | |||
self._pc = 0 | |||
self._tensor_remaps = None | |||
apply.disable(apply_with_tracing) | |||
apply.disable(apply_const_with_tracing) | |||
apply.disable(apply_symbolic_mode) | |||
apply.disable(apply_const_symbolic_mode) | |||
apply.disable(apply_compiled_mode) | |||
active_trace = None | |||
self._tensor_remaps = None | |||
apply.disable(apply_with_tracing) | |||
apply.disable(apply_const_with_tracing) | |||
apply.disable(apply_symbolic_mode) | |||
apply.disable(apply_const_symbolic_mode) | |||
apply.disable(apply_compiled_mode) | |||
self._set_active(False) | |||
def do_exit(): | |||
if not self._untraced and self._pc != len(self._seq): | |||
raise TraceMismatchError("premature end") | |||
if not self._symbolic or not self._untraced: | |||
for x in self._active_tensors: | |||
x._dev_tensor() | |||
try: | |||
do_enter() | |||
yield | |||
do_exit() | |||
except: | |||
interrupted = True | |||
raise | |||
finally: | |||
do_finalize() | |||
if interrupted: | |||
self._reset() | |||
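Editor's note: the refactor splits the old monolithic ``_setup`` into ``do_enter`` / ``do_exit`` / ``do_finalize`` so that finalization always runs and an interrupted trace is fully reset. Stripped of the tracing details, the control flow reduces to the following generic pattern (a sketch of the structure, not the patch itself):

    import contextlib

    @contextlib.contextmanager
    def guarded_setup(enter, exit_check, finalize, reset):
        interrupted = False
        try:
            enter()                # activate tracing / compiled execution
            yield
            exit_check()           # e.g. raise TraceMismatchError on premature end
        except:
            interrupted = True
            raise
        finally:
            finalize()             # restore escaped tensors, disable apply hooks
            if interrupted:
                reset()            # drop partial state so the next run starts clean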
def _begin_excluded_region(self): | |||
if self._capture_as_const: | |||
@@ -368,6 +418,7 @@ class trace: | |||
def _compile(self): | |||
graph = self._graph = G.Graph() | |||
graph.options.no_force_inplace = True | |||
graph.options.async_exec_level = 0b100 | |||
self._apply_graph_options(graph) | |||
# graph.options.graph_opt_level = 0 | |||
need_reset_nodes = self._need_reset_nodes = [] | |||
@@ -570,7 +621,9 @@ class trace: | |||
if h not in h2v: | |||
assert info.external | |||
assert info.bound_data | |||
h2v[h] = graph.make_const(info.bound_data._dev_tensor()) | |||
h2v[h] = graph.make_const( | |||
info.bound_data.numpy(), dtype=info.dtype, device=info.device | |||
) | |||
ivars.append(h2v[h]) | |||
ovars = apply(op, *ivars) | |||
assert len(ovars) == len(ohandles) | |||
@@ -12,7 +12,7 @@ import os | |||
import sys | |||
_all_loggers = [] | |||
_default_level_name = os.getenv("MEGENGINE_LOGGING_LEVEL", "ERROR") | |||
_default_level_name = os.getenv("MEGENGINE_LOGGING_LEVEL", "INFO") | |||
_default_level = logging.getLevelName(_default_level_name.upper()) | |||
@@ -8,6 +8,7 @@ | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
from .activation import LeakyReLU, PReLU, ReLU, Sigmoid, Softmax | |||
from .adaptive_pooling import AdaptiveAvgPool2d, AdaptiveMaxPool2d | |||
from .batchnorm import BatchNorm1d, BatchNorm2d, SyncBatchNorm | |||
from .concat import Concat | |||
from .conv import Conv2d, ConvRelu2d, ConvTranspose2d, LocalConv2d | |||
@@ -20,10 +20,10 @@ class Softmax(Module): | |||
.. math:: | |||
\text{Softmax}(x_{i}) = \frac{exp(x_i)}{\sum_j exp(x_j)} | |||
It is applied to an n-dimensional input Tensor and rescaling them so that the elements of the | |||
n-dimensional output Tensor lie in the range of `[0, 1]` and sum to 1. | |||
It is applied to all elements along the given axis, and rescales them so that | |||
they stay in the range `[0, 1]` and sum to 1. | |||
:param axis: An axis along which softmax will be applied. By default, | |||
:param axis: Along which axis softmax will be applied. By default, | |||
softmax will apply along the highest ranked axis. | |||
Examples: | |||
@@ -55,6 +55,9 @@ class Softmax(Module): | |||
def forward(self, inputs): | |||
return softmax(inputs, self.axis) | |||
def _module_info_string(self) -> str: | |||
return "axis={axis}".format(axis=self.axis) | |||
class Sigmoid(Module): | |||
r""" | |||
@@ -138,8 +141,7 @@ class PReLU(Module): | |||
\end{cases} | |||
Here :math:`a` is a learnable parameter. When called without arguments, `PReLU()` uses | |||
a single paramter :math:`a` across all input channel. If called with `PReLU(num_of_channels)`, | |||
a seperate :math:`a` is used for each input channle. | |||
a single parameter :math:`a` across all input channels. If called with `PReLU(num_of_channels)`, each input channel has its own :math:`a`. | |||
:param num_parameters: number of :math:`a` to learn; only two values are legitimate: | |||
1, or the number of channels at input. Default: 1 | |||
@@ -0,0 +1,114 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
from abc import abstractmethod | |||
from typing import Tuple, Union | |||
from ..functional import adaptive_avg_pool2d, adaptive_max_pool2d | |||
from ..tensor import Parameter, Tensor | |||
from .module import Module | |||
class _AdaptivePoolNd(Module): | |||
def __init__( | |||
self, oshp: Union[Tuple[int, int], int, Tensor], | |||
): | |||
super(_AdaptivePoolNd, self).__init__() | |||
self.oshp = oshp | |||
@abstractmethod | |||
def forward(self, inp): | |||
pass | |||
class AdaptiveMaxPool2d(_AdaptivePoolNd): | |||
r"""Applies a 2D max adaptive pooling over an input. | |||
For instance, given an input of the size :math:`(N, C, H, W)` and | |||
an output shape :math:`(OH, OW)`, this layer generates the output of | |||
the size :math:`(N, C, OH, OW)` through a process described as: | |||
.. math:: | |||
\begin{aligned} | |||
out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} | |||
\text{input}(N_i, C_j, \text{stride[0]} \times h + m, | |||
\text{stride[1]} \times w + n) | |||
\end{aligned} | |||
The kernel size and stride can be inferred from the input shape and output shape: | |||
padding: (0, 0) | |||
stride: (floor(IH / OH), floor(IW / OW)) | |||
kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w) | |||
Examples: | |||
.. testcode:: | |||
import numpy as np | |||
import megengine as mge | |||
import megengine.module as M | |||
m = M.AdaptiveMaxPool2d((2, 2)) | |||
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) | |||
oup = m(inp) | |||
print(oup.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[[[[5. 7.] | |||
[13. 15.]]]] | |||
""" | |||
def forward(self, inp): | |||
return adaptive_max_pool2d(inp, self.oshp) | |||
class AdaptiveAvgPool2d(_AdaptivePoolNd): | |||
r"""Applies a 2D average pooling over an input. | |||
For instance, given an input of the size :math:`(N, C, H, W)` and | |||
an output shape :math:`(OH, OW)`, this layer generates the output of | |||
the size :math:`(N, C, OH, OW)` through a process described as: | |||
.. math:: | |||
out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} | |||
input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) | |||
The kernel size and stride can be inferred from the input shape and output shape: | |||
padding: (0, 0) | |||
stride: (floor(IH / OH), floor(IW / OW)) | |||
kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w) | |||
Examples: | |||
.. testcode:: | |||
import numpy as np | |||
import megengine as mge | |||
import megengine.module as M | |||
m = M.AdaptiveAvgPool2d((2, 2)) | |||
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) | |||
oup = m(inp) | |||
print(oup.numpy()) | |||
Outputs: | |||
.. testoutput:: | |||
[[[[2.5 4.5] | |||
[10.5 12.5]]]] | |||
""" | |||
def forward(self, inp): | |||
return adaptive_avg_pool2d(inp, self.oshp) |
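Editor's note: given the inference rule above, when the input size is an exact multiple of the output size adaptive pooling reduces to ordinary pooling. A small check, assuming ``M.AvgPool2d`` is available (it is not part of this patch):

    import numpy as np
    import megengine as mge
    import megengine.module as M

    inp = mge.tensor(np.arange(16, dtype="float32").reshape(1, 1, 4, 4))
    # IH = IW = 4, OH = OW = 2  ->  stride = floor(4 / 2) = 2, kernel = 4 - (2 - 1) * 2 = 2
    a = M.AdaptiveAvgPool2d((2, 2))(inp)
    b = M.AvgPool2d(kernel_size=2, stride=2)(inp)
    np.testing.assert_allclose(a.numpy(), b.numpy())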
@@ -11,7 +11,7 @@ from typing import Optional | |||
import numpy as np | |||
from ..distributed.group import WORLD, Group | |||
from ..functional import batch_norm2d, sync_batch_norm | |||
from ..functional.nn import batch_norm, sync_batch_norm | |||
from ..tensor import Parameter, Tensor | |||
from . import init | |||
from .module import Module | |||
@@ -96,7 +96,7 @@ class _BatchNorm(Module): | |||
else: | |||
exponential_average_factor = 0.0 # useless | |||
output = batch_norm2d( | |||
output = batch_norm( | |||
inp, | |||
self.running_mean if self.track_running_stats else None, | |||
self.running_var if self.track_running_stats else None, | |||
@@ -113,6 +113,13 @@ class _BatchNorm(Module): | |||
return output | |||
def _module_info_string(self) -> str: | |||
s = ( | |||
"{num_features}, eps={eps}, momentum={momentum}, affine={affine}, " | |||
"track_running_stats={track_running_stats}" | |||
) | |||
return s.format(**self.__dict__) | |||
class SyncBatchNorm(_BatchNorm): | |||
r""" | |||
@@ -213,8 +220,8 @@ class BatchNorm2d(_BatchNorm): | |||
of 0.9. | |||
If :attr:`track_running_stats` is set to ``False``, this layer will not | |||
keep running estimates, and batch statistics are instead used during | |||
evaluation time. | |||
keep running estimates; batch statistics are used during | |||
evaluation time instead. | |||
.. note:: | |||
This :attr:`momentum` argument is different from one used in optimizer | |||
@@ -229,15 +236,14 @@ class BatchNorm2d(_BatchNorm): | |||
Spatial Batch Normalization. | |||
:type num_features: int | |||
:param num_features: usually the :math:`C` from an input of size | |||
:math:`(N, C, H, W)` or the highest ranked dimension of an input with | |||
:param num_features: usually :math:`C` from an input of shape | |||
:math:`(N, C, H, W)` or the highest ranked dimension of an input | |||
less than 4D. | |||
:type eps: float | |||
:param eps: a value added to the denominator for numerical stability. | |||
Default: 1e-5 | |||
:type momentum: float | |||
:param momentum: the value used for the `running_mean` and `running_var` | |||
computation. | |||
:param momentum: the value used for the ``running_mean`` and ``running_var`` computation. | |||
Default: 0.9 | |||
:type affine: bool | |||
:param affine: a boolean value that when set to True, this module has | |||
@@ -70,6 +70,21 @@ class _ConvNd(Module): | |||
def _infer_bias_shape(self): | |||
pass | |||
def _module_info_string(self): | |||
s = "{in_channels}, {out_channels}, kernel_size={kernel_size}" | |||
if self.stride != (1,) * len(self.stride): | |||
s += ", stride={stride}" | |||
if self.padding != (0,) * len(self.padding): | |||
s += ", padding={padding}" | |||
if self.dilation != (1,) * len(self.dilation): | |||
s += ", dilation={dilation}" | |||
if self.groups != 1: | |||
s += ", groups={groups}" | |||
if self.bias is None: | |||
s += ", bias=False" | |||
return s.format(**self.__dict__) | |||
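Editor's note: with this hook, printing a convolution module now includes only its non-default arguments, roughly like the following; the exact string depends on the base ``Module.__repr__``, so treat it as an illustration:

    import megengine.module as M

    m = M.Conv2d(3, 16, 3, stride=2, bias=False)
    print(m)   # e.g. Conv2d(3, 16, kernel_size=3, stride=(2, 2), bias=False)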
class Conv2d(_ConvNd): | |||
r"""Applies a 2D convolution over an input tensor. | |||
@@ -84,8 +99,8 @@ class Conv2d(_ConvNd): | |||
\sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k) | |||
where :math:`\star` is the valid 2D cross-correlation operator, | |||
:math:`N` is a batch size, :math:`C` denotes a number of channels, | |||
:math:`H` is a height of input planes in pixels, and :math:`W` is | |||
:math:`N` is batch size, :math:`C` denotes number of channels, | |||
:math:`H` is height of input planes in pixels, and :math:`W` is | |||
width in pixels. | |||
When `groups == in_channels` and `out_channels == K * in_channels`, | |||
@@ -105,9 +120,8 @@ class Conv2d(_ConvNd): | |||
:param padding: size of the paddings added to the input on both sides of its | |||
spatial dimensions. Only zero-padding is supported. Default: 0 | |||
:param dilation: dilation of the 2D convolution operation. Default: 1 | |||
:param groups: number of groups to divide input and output channels into, | |||
so as to perform a "grouped convolution". When groups is not 1, | |||
in_channels and out_channels must be divisible by groups, | |||
:param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1, | |||
``in_channels`` and ``out_channels`` must be divisible by ``groups``, | |||
and there would be an extra dimension at the beginning of the weight's | |||
shape. Specifically, the shape of weight would be `(groups, | |||
out_channel // groups, in_channels // groups, *kernel_size)`. | |||
@@ -115,9 +129,9 @@ class Conv2d(_ConvNd): | |||
True | |||
:param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default: | |||
`CROSS_CORRELATION` | |||
:param compute_mode: When set to `DEFAULT`, no special requirements will be | |||
placed on the precision of intermediate results. When set to `FLOAT32`, | |||
float32 would be used for accumulator and intermediate result, but only | |||
:param compute_mode: When set to "DEFAULT", no special requirements will be | |||
placed on the precision of intermediate results. When set to "FLOAT32", | |||
"Float32" would be used for accumulator and intermediate result, but only | |||
effective when input and output are of float16 dtype. | |||
Examples: | |||
@@ -221,7 +235,7 @@ class ConvTranspose2d(_ConvNd): | |||
r"""Applies a 2D transposed convolution over an input tensor. | |||
This module is also known as a deconvolution or a fractionally-strided convolution. | |||
:class:`ConvTranspose2d` can ben seen as the gradient of :class:`Conv2d` operation | |||
:class:`ConvTranspose2d` can be seen as the gradient of :class:`Conv2d` operation | |||
with respect to its input. | |||
Convolution usually reduces the size of input, while transposed convolution works | |||
@@ -237,8 +251,7 @@ class ConvTranspose2d(_ConvNd): | |||
:param padding: size of the paddings added to the input on both sides of its | |||
spatial dimensions. Only zero-padding is supported. Default: 0 | |||
:param dilation: dilation of the 2D convolution operation. Default: 1 | |||
:param groups: number of groups to divide input and output channels into, | |||
so as to perform a "grouped convolution". When ``groups`` is not 1, | |||
:param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1, | |||
``in_channels`` and ``out_channels`` must be divisible by ``groups``, | |||
and there would be an extra dimension at the beginning of the weight's | |||
shape. Specifically, the shape of weight would be ``(groups, | |||
@@ -247,9 +260,9 @@ class ConvTranspose2d(_ConvNd): | |||
True | |||
:param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default: | |||
`CROSS_CORRELATION` | |||
:param compute_mode: When set to `DEFAULT`, no special requirements will be | |||
placed on the precision of intermediate results. When set to `FLOAT32`, | |||
float32 would be used for accumulator and intermediate result, but only | |||
:param compute_mode: When set to "DEFAULT", no special requirements will be | |||
placed on the precision of intermediate results. When set to "FLOAT32", | |||
"Float32" would be used for accumulator and intermediate result, but only | |||
effective when input and output are of float16 dtype. | |||
""" | |||
@@ -327,7 +340,7 @@ class ConvTranspose2d(_ConvNd): | |||
class LocalConv2d(Conv2d): | |||
r"""Applies a spatial convolution with untied kernels over an input 4D tensor. | |||
r"""Applies a spatial convolution with untied kernels over an groupped channeled input 4D tensor. | |||
It is also known as the locally connected layer. | |||
:param in_channels: number of input channels. | |||
@@ -340,9 +353,9 @@ class LocalConv2d(Conv2d): | |||
:param stride: stride of the 2D convolution operation. Default: 1 | |||
:param padding: size of the paddings added to the input on both sides of its | |||
spatial dimensions. Only zero-padding is supported. Default: 0 | |||
:param groups: number of groups to divide input and output channels into, | |||
so as to perform a "grouped convolution". When groups is not 1, | |||
in_channels and out_channels must be divisible by groups. | |||
:param groups: number of groups into which the input and output channels are divided, | |||
so as to perform a "grouped convolution". When ``groups`` is not 1, | |||
``in_channels`` and ``out_channels`` must be divisible by ``groups``. | |||
The shape of weight is `(groups, output_height, output_width, | |||
in_channels // groups, *kernel_size, out_channels // groups)`. | |||
""" | |||
@@ -11,7 +11,7 @@ from .module import Module | |||
class Dropout(Module): | |||
r"""Randomly set input elements to zeros with the probability :math:`drop\_prob` during training. | |||
r"""Randomly sets input elements to zeros with the probability :math:`drop\_prob` during training. | |||
Commonly used in large networks to prevent overfitting. | |||
Note that we perform dropout only during training; we also rescale (multiply) the output tensor | |||
by :math:`\frac{1}{1 - drop\_prob}`. During inference, :class:`~.Dropout` is equal to :class:`~.Identity`. | |||
@@ -28,3 +28,6 @@ class Dropout(Module): | |||
return dropout(inputs, self.drop_prob, training=True) | |||
else: | |||
return inputs | |||
def _module_info_string(self) -> str: | |||
return "drop_prob={drop_prob}".format(drop_prob=self.drop_prob) |
@@ -34,7 +34,7 @@ class Elemwise(Module): | |||
* "EXP": exp(x) | |||
* "TANH": tanh(x) | |||
* "FUSE_MUL_ADD3": x * y + z | |||
* "FAST_TANH": fast_tanh(x) | |||
* "FAST_TANH": x * (27. + x * x) / (27. + 9. * x * x) | |||
* "NEGATE": -x | |||
* "ACOS": acos(x) | |||
* "ASIN": asin(x) | |||
@@ -56,9 +56,9 @@ class Elemwise(Module): | |||
* "SIGMOID_GRAD": sigmoid_grad | |||
* "SWITCH_GT0": switch_gt0 | |||
* "TANH_GRAD": tanh_grad | |||
* "LT": lt | |||
* "LT": less | |||
* "LEQ": leq | |||
* "EQ": eq | |||
* "EQ": equal | |||
* "POW": pow | |||
* "LOG_SUM_EXP": log_sum_exp | |||
* "FAST_TANH_GRAD": fast_tanh_grad | |||
@@ -10,7 +10,7 @@ from typing import Optional | |||
import numpy as np | |||
from ..functional import embedding as embedding_func | |||
from ..functional.nn import embedding as embedding_func | |||
from ..tensor import Parameter | |||
from . import init | |||
from .module import Module | |||
@@ -26,9 +26,9 @@ class Embedding(Module): | |||
:param num_embeddings: size of embedding dictionary. | |||
:param embedding_dim: size of each embedding vector. | |||
:param padding_idx: should be set to None, not support now. | |||
:param max_norm: should be set to None, not support now. | |||
:param norm_type: should be set to None, not support now. | |||
:param padding_idx: should be set to None, not supported now. | |||
:param max_norm: should be set to None, not supported now. | |||
:param norm_type: should be set to None, not supported now. | |||
:param initial_weight: the learnable weights of the module of shape (num_embeddings, embedding_dim). | |||
Examples: | |||
@@ -121,8 +121,8 @@ class Embedding(Module): | |||
r""" | |||
Creates Embedding instance from given 2-dimensional FloatTensor. | |||
:param embeddings: Tensor contained weight for the embedding. | |||
:param freeze: If ``True``, the weight does not get updated during the learning process. Default: ``True``. | |||
:param embeddings: tensor containing the weights for the embedding. | |||
:param freeze: if ``True``, the weight does not get updated during the learning process. Default: True. | |||
:param padding_idx: should be set to None, not supported now. | |||
:param max_norm: should be set to None, not supported now. | |||
:param norm_type: should be set to None, not supported now. | |||
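A minimal usage sketch of ``from_pretrained`` under the parameters listed above (the concrete values are made up for illustration):

```python
import numpy as np
import megengine as mge
import megengine.module as M

weight = mge.tensor(np.array([[1.2, 2.3], [3.4, 4.5]], dtype=np.float32))
emb = M.Embedding.from_pretrained(weight, freeze=True)  # weight stays fixed during training

indices = mge.tensor(np.array([0, 1, 1], dtype=np.int32))
print(emb(indices).numpy())  # rows of `weight` gathered by index
```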
@@ -6,7 +6,7 @@ | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
from ..functional import identity | |||
from ..functional import copy | |||
from .module import Module | |||
@@ -14,4 +14,4 @@ class Identity(Module): | |||
r"""A placeholder identity operator that will ignore any argument.""" | |||
def forward(self, x): | |||
return identity(x) | |||
return copy(x) |
@@ -18,48 +18,48 @@ from ..tensor import Tensor | |||
def fill_(tensor: Tensor, val: Union[float, int]) -> None: | |||
"""Fill the given ``tensor`` with value ``val``. | |||
"""Fills the given ``tensor`` with value ``val``. | |||
:param tensor: An n-dimentional tensor to be initialized | |||
:param val: The value to be filled throughout the tensor | |||
:param tensor: tensor to be initialized. | |||
:param val: value to be filled throughout the tensor. | |||
""" | |||
tensor._reset(full(shape=tensor.shape, value=val, dtype=tensor.dtype)) | |||
def zeros_(tensor: Tensor) -> None: | |||
"""Fill the given ``tensor`` with scalar value `0`. | |||
"""Fills the given ``tensor`` with scalar value `0`. | |||
:param tensor: An n-dimentional tensor to be initialized | |||
:param tensor: tensor to be initialized. | |||
""" | |||
fill_(tensor, 0) | |||
def ones_(tensor: Tensor) -> None: | |||
"""Fill the given ``tensor`` with the scalar value `1`. | |||
"""Fills the given ``tensor`` with the scalar value `1`. | |||
:param tensor: An n-dimentional tensor to be initialized | |||
:param tensor: tensor to be initialized. | |||
""" | |||
fill_(tensor, 1) | |||
def uniform_(tensor: Tensor, a: float = 0.0, b: float = 1.0) -> None: | |||
r"""Fill the given ``tensor`` with random value sampled from uniform distribution | |||
r"""Fills the given ``tensor`` with random value sampled from uniform distribution | |||
:math:`\mathcal{U}(\text{a}, \text{b})`. | |||
:param tensor: An n-dimentional tensor to be initialized | |||
:param a: Lower bound of the sampling interval | |||
:param b: Upper bound of the sampling interval | |||
:param tensor: tensor to be initialized. | |||
:param a: lower bound of the sampling interval. | |||
:param b: upper bound of the sampling interval. | |||
""" | |||
tensor._reset(uniform(size=tensor.shape, low=a, high=b).astype(tensor.dtype)) | |||
def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: | |||
r"""Fill the given ``tensor`` with random value sampled from normal distribution | |||
r"""Fills the given ``tensor`` with random value sampled from normal distribution | |||
:math:`\mathcal{N}(\text{mean}, \text{std}^2)`. | |||
:param tensor: An n-dimentional tensor to be initialized | |||
:param mean: The mean of the normal distribution | |||
:param std: The standard deviation of the normal distribution | |||
:param tensor: tensor to be initialized. | |||
:param mean: mean of the normal distribution. | |||
:param std: standard deviation of the normal distribution. | |||
""" | |||
tensor._reset(normal(size=tensor.shape, mean=mean, std=std).astype(tensor.dtype)) | |||
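A short sketch of how these in-place initializers are typically used on a module's parameters (assuming ``megengine.module.init`` is importable, as the ``from . import init`` lines elsewhere in this diff suggest):

```python
import megengine.module as M
from megengine.module import init

fc = M.Linear(4, 8)
init.normal_(fc.weight, mean=0.0, std=0.02)  # re-initialize weight in place
init.zeros_(fc.bias)                         # zero the bias in place
```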
@@ -67,7 +67,7 @@ def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: | |||
def calculate_gain( | |||
nonlinearity: str, param: Optional[Union[int, float]] = None | |||
) -> float: | |||
r"""Return a recommended gain value (see the table below) for the given nonlinearity | |||
r"""Returns a recommended gain value (see the table below) for the given nonlinearity | |||
function. | |||
================= ==================================================== | |||
@@ -81,8 +81,8 @@ def calculate_gain( | |||
Leaky Relu :math:`\sqrt{\frac{2}{1 + {\text{negative}_\text{slope}}^2}}` | |||
================= ==================================================== | |||
:param nonlinearity: Name of the non-linear function | |||
:param param: Optional parameter for leaky_relu. Only effective when | |||
:param nonlinearity: name of the non-linear function. | |||
:param param: optional parameter for leaky_relu. Only effective when | |||
``nonlinearity`` is "leaky_relu". | |||
""" | |||
@@ -119,10 +119,10 @@ def calculate_gain( | |||
def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]: | |||
""" | |||
Calculate fan_in / fan_out value for given weight tensor. This function assumes | |||
input tensor is stored in NCHW format. | |||
Calculates fan_in / fan_out value for given weight tensor. This function assumes | |||
input tensor is stored in ``NCHW`` format. | |||
:param tensor: Weight tensor in NCHW format | |||
:param tensor: weight tensor in ``NCHW`` format. | |||
""" | |||
shape = tensor.shape | |||
ndim = len(shape) | |||
@@ -148,13 +148,13 @@ def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]: | |||
def calculate_correct_fan(tensor: Tensor, mode: str) -> float: | |||
""" | |||
Calculate fan_in or fan_out value for given weight tensor, depending on given | |||
Calculates fan_in or fan_out value for given weight tensor, depending on given | |||
``mode``. | |||
See :func:`calculate_fan_in_and_fan_out` for details. | |||
:param tensor: Weight tensor in NCHW format | |||
:param mode: ``'fan_in'`` or ``'fan_out'`` | |||
:param tensor: weight tensor in ``NCHW`` format. | |||
:param mode: "fan_in" or "fan_out". | |||
""" | |||
mode = mode.lower() | |||
valid_modes = ["fan_in", "fan_out"] | |||
@@ -168,7 +168,7 @@ def calculate_correct_fan(tensor: Tensor, mode: str) -> float: | |||
def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None: | |||
r"""Fill ``tensor`` with random values sampled from :math:`\mathcal{U}(-a, a)` | |||
r"""Fills tensor with random values sampled from :math:`\mathcal{U}(-a, a)` | |||
where | |||
.. math:: | |||
@@ -178,8 +178,8 @@ def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None: | |||
`Understanding the difficulty of training deep feedforward neural networks` - | |||
Glorot, X. & Bengio, Y. (2010). | |||
:param tensor: An n-dimentional tensor to be initialized | |||
:param gain: Scaling factor for :math:`a`. | |||
:param tensor: tensor to be initialized. | |||
:param gain: scaling factor for :math:`a`. | |||
""" | |||
fan_in, fan_out = calculate_fan_in_and_fan_out(tensor) | |||
std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) | |||
@@ -188,7 +188,7 @@ def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None: | |||
def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None: | |||
r"""Fill ``tensor`` with random values sampled from | |||
r"""Fills tensor with random values sampled from | |||
:math:`\mathcal{N}(0, \text{std}^2)` where | |||
.. math:: | |||
@@ -198,8 +198,8 @@ def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None: | |||
`Understanding the difficulty of training deep feedforward neural networks` - | |||
Glorot, X. & Bengio, Y. (2010). | |||
:param tensor: An n-dimentional tensor to be initialized | |||
:param gain: Scaling factor for :math:`std`. | |||
:param tensor: tensor to be initialized. | |||
:param gain: scaling factor for :math:`std`. | |||
""" | |||
fan_in, fan_out = calculate_fan_in_and_fan_out(tensor) | |||
std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) | |||
@@ -209,7 +209,7 @@ def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None: | |||
def msra_uniform_( | |||
tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu" | |||
) -> None: | |||
r"""Fill ``tensor`` wilth random values sampled from | |||
r"""Fills tensor wilth random values sampled from | |||
:math:`\mathcal{U}(-\text{bound}, \text{bound})` where | |||
.. math:: | |||
@@ -219,13 +219,13 @@ def msra_uniform_( | |||
`Delving deep into rectifiers: Surpassing human-level performance on ImageNet | |||
classification` | |||
:param tensor: An n-dimentional tensor to be initialized | |||
:param a: Optional parameter for calculating gain for leaky_relu. See | |||
:param tensor: tensor to be initialized. | |||
:param a: optional parameter for calculating gain for leaky_relu. See | |||
:func:`calculate_gain` for details. | |||
:param mode: ``'fan_in'`` or ``'fan_out'``, used to calculate :math:`gain`, the | |||
:param mode: "fan_in" or "fan_out", used to calculate :math:`gain`, the | |||
scaling factor for :math:`bound`. See :func:`calculate_fan_in_and_fan_out` for | |||
details. | |||
:param nonlinearity: Name of the non-linear function used to calculate :math:`gain`. | |||
:param nonlinearity: name of the non-linear function used to calculate :math:`gain`. | |||
See :func:`calculate_gain` for details. | |||
""" | |||
fan = calculate_correct_fan(tensor, mode) | |||
@@ -238,7 +238,7 @@ def msra_uniform_( | |||
def msra_normal_( | |||
tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu" | |||
) -> None: | |||
r"""Fill ``tensor`` wilth random values sampled from | |||
r"""Fills tensor wilth random values sampled from | |||
:math:`\mathcal{N}(0, \text{std}^2)` where | |||
.. math:: | |||
@@ -248,13 +248,13 @@ def msra_normal_( | |||
`Delving deep into rectifiers: Surpassing human-level performance on ImageNet | |||
classification` | |||
:param tensor: An n-dimentional tensor to be initialized | |||
:param a: Optional parameter for calculating gain for leaky_relu. See | |||
:param tensor: tensor to be initialized. | |||
:param a: optional parameter for calculating gain for leaky_relu. See | |||
:func:`calculate_gain` for details. | |||
:param mode: ``'fan_in'`` or ``'fan_out'``, used to calculate :math:`gain`, the | |||
:param mode: "fan_in" or "fan_out", used to calculate :math:`gain`, the | |||
scaling factor for :math:`std`. See :func:`calculate_fan_in_and_fan_out` for | |||
details. | |||
:param nonlinearity: Name of the non-linear function used to calculate :math:`gain`. | |||
:param nonlinearity: name of the non-linear function used to calculate :math:`gain`. | |||
See :func:`calculate_gain` for details. | |||
""" | |||
fan = calculate_correct_fan(tensor, mode) | |||
@@ -7,7 +7,7 @@ | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
import numpy as np | |||
from ..functional import linear | |||
from ..functional.nn import linear | |||
from ..tensor import Parameter | |||
from . import init | |||
from .module import Module | |||
@@ -25,7 +25,7 @@ class Linear(Module): | |||
:param in_features: size of each input sample. | |||
:param out_features: size of each output sample. | |||
:param bias: If set to ``False``, the layer will not learn an additive bias. | |||
:param bias: if it's ``False``, the layer will not learn an additive bias. | |||
Default: ``True`` | |||
Examples: | |||
@@ -78,3 +78,8 @@ class Linear(Module): | |||
def forward(self, x): | |||
return self._calc_linear(x, self.weight, self.bias) | |||
def _module_info_string(self) -> str: | |||
return "in_features={}, out_features={}, bias={}".format( | |||
self.in_features, self.out_features, self.bias is not None | |||
) |
@@ -69,14 +69,14 @@ class Module(metaclass=ABCMeta): | |||
self._forward_pre_hooks = OrderedDict() | |||
self._forward_hooks = OrderedDict() | |||
self._modules = [] | |||
@abstractmethod | |||
def forward(self, inputs): | |||
pass | |||
def register_forward_pre_hook(self, hook: Callable) -> HookHandler: | |||
"""Register a hook to handle forward inputs. `hook` should be a function | |||
Note that `inputs` keyword inputs | |||
"""Registers a hook to handle forward inputs. `hook` should be a function. | |||
:param hook: a function that receives `module` and `inputs`, then returns | |||
a modified `inputs` or `None`. | |||
@@ -85,7 +85,7 @@ class Module(metaclass=ABCMeta): | |||
return HookHandler(self._forward_pre_hooks, hook) | |||
def register_forward_hook(self, hook: Callable) -> HookHandler: | |||
"""Register a hook to handle forward results. `hook` should be a function that | |||
"""Registers a hook to handle forward results. `hook` should be a function that | |||
receives `module`, `inputs` and `outputs`, then returns a modified `outputs` or `None`. | |||
This method returns a handler with :meth:`~.HookHandler.remove` interface to delete the hook. | |||
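A hedged usage sketch of the hook registration API (only the hook signature and the returned ``HookHandler`` documented above are assumed):

```python
import numpy as np
import megengine as mge
import megengine.module as M

net = M.Linear(4, 2)

def log_output_shape(module, inputs, outputs):
    print(type(module).__name__, outputs.shape)  # returning None keeps outputs unchanged

handler = net.register_forward_hook(log_output_shape)
net(mge.tensor(np.zeros((3, 4), dtype=np.float32)))  # hook fires on this call
handler.remove()  # detach the hook via the returned HookHandler
```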
@@ -124,12 +124,12 @@ class Module(metaclass=ABCMeta): | |||
returned iterable is guaranteed to be identical, as long as all the involved | |||
module objects' ``__dict__`` does not change throughout those calls. | |||
:param recursive: Whether to recursively scan all the submodules. | |||
:param with_key: Whether to yield keys along with yielded objects. | |||
:param with_parent: Whether to yield ``self`` along with yielded objects. | |||
:param prefix: The prefix appended to the yielded keys. | |||
:param predicate: The predicate function applied to scanned objects. | |||
:param seen: A dict that records whether a module has been traversed yet. | |||
:param recursive: whether to recursively scan all the submodules. | |||
:param with_key: whether to yield keys along with yielded objects. | |||
:param with_parent: whether to yield ``self`` along with yielded objects. | |||
:param prefix: prefix appended to the yielded keys. | |||
:param predicate: the predicate function applied to scanned objects. | |||
:param seen: a dict that records whether a module has been traversed yet. | |||
""" | |||
if seen is None: | |||
seen = set([id(self)]) | |||
@@ -191,10 +191,10 @@ class Module(metaclass=ABCMeta): | |||
self, prefix: Optional[str] = None, recursive: bool = True, **kwargs | |||
) -> Iterable[Tuple[str, Parameter]]: | |||
"""Returns an iterable for key :class:`~.Parameter` pairs of the module, where | |||
``key`` is the dotted path from this module to the :class:`~.Parameter` . | |||
``key`` is the dotted path from this module to the :class:`~.Parameter`. | |||
:param prefix: The prefix prepended to the keys. | |||
:param recursive: If ``True``, returns all :class:`~.Parameter` within this | |||
:param prefix: prefix prepended to the keys. | |||
:param recursive: if ``True``, returns all :class:`~.Parameter` within this | |||
module, else only returns :class:`~.Parameter` that are direct attributes | |||
of this module. | |||
""" | |||
@@ -223,7 +223,7 @@ class Module(metaclass=ABCMeta): | |||
Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`. | |||
:param recursive: If ``True``, returns all buffers within this | |||
:param recursive: if ``True``, returns all buffers within this | |||
module, else only returns buffers that are direct attributes | |||
of this module. | |||
""" | |||
@@ -239,8 +239,8 @@ class Module(metaclass=ABCMeta): | |||
Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`. | |||
:param prefix: The prefix prepended to the keys. | |||
:param recursive: If ``True``, returns all buffers within this | |||
:param prefix: prefix prepended to the keys. | |||
:param recursive: if ``True``, returns all buffers within this | |||
module, else only returns buffers that are direct attributes | |||
of this module. | |||
""" | |||
@@ -285,7 +285,7 @@ class Module(metaclass=ABCMeta): | |||
module, including itself, where 'key' is the dotted path from this module to the | |||
submodules. | |||
:param prefix: The prefix prepended to the path. | |||
:param prefix: prefix prepended to the path. | |||
""" | |||
if "with_parent" in kwargs and kwargs["with_parent"]: | |||
yield ("" if prefix is None else prefix), self, None | |||
@@ -296,24 +296,24 @@ class Module(metaclass=ABCMeta): | |||
) | |||
def apply(self, fn: "Callable[[Module], Any]") -> None: | |||
"""Apply function ``fn`` to all the modules within this module, including | |||
"""Applies function ``fn`` to all the modules within this module, including | |||
itself. | |||
:param fn: The function to be applied on modules. | |||
:param fn: the function to be applied on modules. | |||
""" | |||
for it in self.modules(): | |||
fn(it) | |||
@deprecated(version="1.0") | |||
def zero_grad(self) -> None: | |||
"""Set all parameters' grads to zero | |||
"""Sets all parameters' grads to zero | |||
""" | |||
for param in self.parameters(): | |||
if param.grad is not None: | |||
param.grad.reset_zero() | |||
def train(self, mode: bool = True, recursive: bool = True) -> None: | |||
"""Set training mode of all the modules within this module (including itself) to | |||
"""Sets training mode of all the modules within this module (including itself) to | |||
``mode``. This effectively sets the ``training`` attributes of those modules | |||
to ``mode``, but only has effect on certain modules (e.g. | |||
:class:`~.BatchNorm2d`, :class:`~.Dropout`, :class:`~.Observer`) | |||
@@ -331,14 +331,14 @@ class Module(metaclass=ABCMeta): | |||
self.apply(fn) | |||
def eval(self) -> None: | |||
"""Set training mode of all the modules within this module (including itself) to | |||
"""Sets training mode of all the modules within this module (including itself) to | |||
``False``. See :meth:`~.Module.train` for details. | |||
""" | |||
self.train(False) | |||
def disable_quantize(self, value=True): | |||
r""" | |||
Set ``module``'s ``quantize_disabled`` attribute and return ``module``. | |||
Sets ``module``'s ``quantize_disabled`` attribute and returns ``module``. | |||
Could be used as a decorator. | |||
""" | |||
@@ -351,7 +351,7 @@ class Module(metaclass=ABCMeta): | |||
def replace_param( | |||
self, params: dict, start_pos: int, seen: Optional[Set[int]] = None | |||
): | |||
"""Replace module's parameters with `params`, used by :class:`~.ParamPack` to | |||
"""Replaces module's parameters with `params`, used by :class:`~.ParamPack` to | |||
speed up multi-machine training. | |||
""" | |||
offset = 0 | |||
@@ -407,7 +407,7 @@ class Module(metaclass=ABCMeta): | |||
state_dict: Union[dict, Callable[[str, Tensor], Optional[np.ndarray]]], | |||
strict=True, | |||
): | |||
r"""Load a given dictionary created by :func:`state_dict` into this module. | |||
r"""Loads a given dictionary created by :func:`state_dict` into this module. | |||
If ``strict`` is ``True``, the keys of :func:`state_dict` must exactly match the keys | |||
returned by :func:`state_dict`. | |||
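A minimal save/restore sketch built on ``state_dict``/``load_state_dict``; ``megengine.save``/``megengine.load`` and the checkpoint file name are assumptions for illustration:

```python
import megengine as mge
import megengine.module as M

net = M.Linear(4, 2)
mge.save(net.state_dict(), "linear.ckpt")  # hypothetical checkpoint path

net2 = M.Linear(4, 2)
net2.load_state_dict(mge.load("linear.ckpt"), strict=True)
```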
@@ -518,3 +518,57 @@ class Module(metaclass=ABCMeta): | |||
loaded.append(k) | |||
return set(loaded), set(skipped) | |||
def __setattr__(self, name: str, value): | |||
if _is_module(value): | |||
modules = self.__dict__.get("_modules") | |||
if modules is None: | |||
raise AttributeError( | |||
"cannot assign module before Module.__init__() call" | |||
) | |||
if name not in self.__dict__: | |||
modules.append(name) | |||
super().__setattr__(name, value) | |||
def __delattr__(self, name: str): | |||
if name in self.__dict__ and _is_module(self.__dict__[name]): | |||
modules = self.__dict__.get("_modules") | |||
modules.remove(name) | |||
super().__delattr__(name) | |||
def _module_info_string(self) -> str: | |||
r"""Set the extra representation of the module. | |||
""" | |||
return "" | |||
def __repr__(self): | |||
def add_indent(repr_str, num_spaces): | |||
s = repr_str.split("\n") | |||
# don't do anything for single-line stuff | |||
if len(s) == 1: | |||
return repr_str | |||
first = s.pop(0) | |||
s = [(num_spaces * " ") + line for line in s] | |||
s = "\n".join(s) | |||
s = first + "\n" + s | |||
return s | |||
extra_lines = [] | |||
extra_repr = self._module_info_string() | |||
if extra_repr: | |||
extra_lines = extra_repr.split("\n") | |||
child_lines = [ | |||
"(" + name + "): " + add_indent(repr(self.__dict__[name]), 2) | |||
for name in self._modules | |||
] | |||
lines = extra_lines + child_lines | |||
main_str = self.__class__.__name__ + "(" | |||
if lines: | |||
# simple one-liner info, which most builtin Modules will use | |||
if len(extra_lines) == 1 and not child_lines: | |||
main_str += extra_lines[0] | |||
else: | |||
main_str += "\n " + "\n ".join(lines) + "\n" | |||
main_str += ")" | |||
return main_str |
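Given the ``_module_info_string`` implementations added for ``Linear`` and ``Dropout`` in this diff, ``__repr__`` should produce nested summaries roughly as in this sketch:

```python
import megengine.module as M

class Block(M.Module):
    def __init__(self):
        super().__init__()
        self.fc = M.Linear(4, 2)
        self.drop = M.Dropout(0.5)

    def forward(self, x):
        return self.drop(self.fc(x))

print(Block())
# Expected to print roughly:
# Block(
#   (fc): Linear(in_features=4, out_features=2, bias=True)
#   (drop): Dropout(drop_prob=0.5)
# )
```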
@@ -29,6 +29,11 @@ class _PoolNd(Module): | |||
def forward(self, inp): | |||
pass | |||
def _module_info_string(self) -> str: | |||
return "kernel_size={kernel_size}, stride={stride}, padding={padding}".format( | |||
**self.__dict__ | |||
) | |||
class MaxPool2d(_PoolNd): | |||
r"""Applies a 2D max pooling over an input. | |||
@@ -5,7 +5,7 @@ | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
from ...functional import add_update, ones, relu, sqrt, sum, zeros | |||
from ...functional import ones, relu, sqrt, sum, zeros | |||
from ...quantization.utils import fake_quant_bias | |||
from .. import conv_bn as Float | |||
from .module import QATModule | |||
@@ -76,18 +76,10 @@ class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule): | |||
bn_var.detach() * num_elements_per_channel / (num_elements_per_channel - 1) | |||
) | |||
exponential_average_factor = 1 - self.bn.momentum | |||
add_update( | |||
self.bn.running_mean, | |||
delta=bn_mean, | |||
alpha=1 - exponential_average_factor, | |||
beta=exponential_average_factor, | |||
) | |||
add_update( | |||
self.bn.running_var, | |||
delta=bn_var, | |||
alpha=1 - exponential_average_factor, | |||
beta=exponential_average_factor, | |||
) | |||
self.bn.running_mean *= self.bn.momentum | |||
self.bn.running_mean += exponential_average_factor * bn_mean | |||
self.bn.running_var *= self.bn.momentum | |||
self.bn.running_var += exponential_average_factor * bn_var | |||
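The in-place update above is a plain exponential moving average; a standalone NumPy sketch of the same arithmetic (not the module code itself):

```python
import numpy as np

momentum = 0.9                    # plays the role of self.bn.momentum
factor = 1.0 - momentum           # exponential_average_factor
running_mean = np.zeros(4, dtype=np.float32)
batch_mean = np.array([0.1, -0.2, 0.3, 0.0], dtype=np.float32)

# Equivalent to: running_mean *= momentum; running_mean += factor * batch_mean
running_mean = momentum * running_mean + factor * batch_mean
```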
def calc_conv_bn_qat(self, inp, approx=True): | |||
if self.training and not approx: | |||
@@ -18,7 +18,7 @@ class Linear(Float.Linear, QATModule): | |||
:param in_features: size of each input sample. | |||
:param out_features: size of each output sample. | |||
:param bias: If set to ``False``, the layer will not learn an additive bias. | |||
Default: ``True`` | |||
Default: True | |||
""" | |||
@@ -52,7 +52,7 @@ class QATModule(Module): | |||
self.weight_fake_quant = safe_call(qconfig.weight_fake_quant) | |||
def _enable_exec(self, with_module, func, enable): | |||
if not with_module: | |||
if not with_module or not func: | |||
return | |||
if enable: | |||
func.enable() | |||
@@ -15,7 +15,7 @@ from .module import QuantizedModule | |||
class Concat(QuantizedModule): | |||
r""" | |||
A :class:`~.QuantizedModule` to do quantized concat, inference only. | |||
A :class:`~.QuantizedModule` to do quantized concat, used for inference only. | |||
""" | |||
def __init__(self, dtype=None): | |||
@@ -29,7 +29,7 @@ class Concat(QuantizedModule): | |||
@classmethod | |||
def from_qat_module(cls, qat_module: QAT.Concat): | |||
r""" | |||
return a :class:`~.QuantizedModule` instance converted from a | |||
Return a :class:`~.QuantizedModule` instance converted from a | |||
:class:`~.QATModule` instance. | |||
""" | |||
return cls(qat_module.get_activation_dtype()) |
@@ -11,17 +11,17 @@ import numpy as np | |||
from ... import module as Float | |||
from ...core.tensor import dtype | |||
from ...functional import conv_bias_activation | |||
from ...functional.nn import conv_bias_activation | |||
from ...tensor import Parameter | |||
from ..qat import conv as QAT | |||
from .module import QuantizedModule | |||
class Conv2d(Float.Conv2d, QuantizedModule): | |||
r"""quantized version of :class:`~.qat.conv.Conv2d`.""" | |||
r"""Applies a 2D convolution over an quantized input tensor, inference only. | |||
r"""Quantized version of :class:`~.qat.conv.Conv2d`.""" | |||
r"""Applies a 2D convolution over a quantized input tensor, used for inference only. | |||
The parameter is same with :class: `~.Conv2d` | |||
The parameters are the same as :class:`~.Conv2d`. | |||
""" | |||
def __init__( | |||
@@ -101,7 +101,7 @@ class Conv2d(Float.Conv2d, QuantizedModule): | |||
class ConvRelu2d(Conv2d): | |||
r"""quantized version of :class:`~.qat.conv.ConvRelu2d`.""" | |||
r"""Quantized version of :class:`~.qat.conv.ConvRelu2d`.""" | |||
def forward(self, inp): | |||
return self.calc_conv_quantized(inp, nonlinear_mode="RELU") |
@@ -11,15 +11,15 @@ from .conv import Conv2d | |||
class _ConvBnActivation2d(Conv2d): | |||
r"""Applies a 2D convolution over an quantized input tensor, inference only. | |||
r"""Applies a 2D convolution over a quantized input tensor, used for inference only. | |||
The parameter is same with :class: `~.Conv2d` | |||
The parameters are the same as :class:`~.Conv2d`. | |||
""" | |||
@classmethod | |||
def from_qat_module(cls, qat_module: QAT._ConvBnActivation2d): | |||
r""" | |||
return a :class:`~.QuantizedModule` instance converted from a | |||
Return a :class:`~.QuantizedModule` instance converted from a | |||
:class:`~.QATModule` instance. | |||
""" | |||
output_dtype = qat_module.get_activation_dtype() | |||
@@ -43,14 +43,14 @@ class _ConvBnActivation2d(Conv2d): | |||
class ConvBn2d(_ConvBnActivation2d): | |||
r"""quantized version of :class:`~.qat.conv_bn.ConvBn2d`.""" | |||
r"""Quantized version of :class:`~.qat.conv_bn.ConvBn2d`.""" | |||
def forward(self, inp): | |||
return self.calc_conv_quantized(inp, nonlinear_mode="IDENTITY") | |||
class ConvBnRelu2d(_ConvBnActivation2d): | |||
r"""quantized version of :class:`~.qat.conv_bn.ConvBnRelu2d`.""" | |||
r"""Quantized version of :class:`~.qat.conv_bn.ConvBnRelu2d`.""" | |||
def forward(self, inp): | |||
return self.calc_conv_quantized(inp, nonlinear_mode="RELU") |
@@ -13,7 +13,7 @@ from .module import QuantizedModule | |||
class Elemwise(QuantizedModule): | |||
r"""quantized version of :class:`~.qat.elemwise.Elemwise`.""" | |||
r"""Quantized version of :class:`~.qat.elemwise.Elemwise`.""" | |||
_elemwise_multi_type_mode = P.ElemwiseMultiType.Mode | |||
@@ -30,7 +30,7 @@ class Elemwise(QuantizedModule): | |||
@classmethod | |||
def from_qat_module(cls, qat_module: QAT.Elemwise): | |||
r""" | |||
return a :class:`~.QuantizedModule` instance converted from a | |||
Return a :class:`~.QuantizedModule` instance converted from a | |||
:class:`~.QATModule` instance. | |||
""" | |||
return cls(qat_module.method.name, qat_module.get_activation_dtype()) |
@@ -15,7 +15,7 @@ from .module import QuantizedModule | |||
class Linear(QuantizedModule): | |||
r"""quantized version of :class:`~.qat.linear.Linear`.""" | |||
r"""Quantized version of :class:`~.qat.linear.Linear`.""" | |||
def __init__( | |||
self, dtype: np.dtype = None, | |||
@@ -31,7 +31,7 @@ class Linear(QuantizedModule): | |||
inp_scale = dtype.get_scale(inp.dtype) | |||
w_scale = dtype.get_scale(self.weight.dtype) | |||
bias_dtype = dtype.qint32(inp_scale * w_scale) | |||
return F.linear( | |||
return F.nn.linear( | |||
inp, | |||
self.weight, | |||
None if self.bias is None else self.bias.astype(bias_dtype), | |||
@@ -40,7 +40,7 @@ class Linear(QuantizedModule): | |||
@classmethod | |||
def from_qat_module(cls, qat_module: QAT.Linear): | |||
r""" | |||
return a :class:`~.QuantizedModule` instance converted from a | |||
Return a :class:`~.QuantizedModule` instance converted from a | |||
:class:`~.QATModule` instance. | |||
""" | |||
output_dtype = qat_module.get_activation_dtype() | |||
@@ -26,6 +26,6 @@ class QuantizedModule(Module): | |||
@abstractmethod | |||
def from_qat_module(cls, qat_module: QATModule): | |||
r""" | |||
return a :class:`~.QuantizedModule` instance converted from a | |||
Return a :class:`~.QuantizedModule` instance converted from a | |||
:class:`~.QATModule` instance. | |||
""" |
@@ -11,7 +11,7 @@ from .module import QuantizedModule | |||
class QuantStub(QuantizedModule): | |||
r""" | |||
quantized version of :class:`~.qat.quant_dequant.QuantStub`, | |||
Quantized version of :class:`~.qat.quant_dequant.QuantStub`, | |||
will convert input to quantized dtype. | |||
""" | |||
@@ -25,7 +25,7 @@ class QuantStub(QuantizedModule): | |||
@classmethod | |||
def from_qat_module(cls, qat_module: QAT.QuantStub): | |||
r""" | |||
return a :class:`~.QuantizedModule` instance converted from a | |||
Return a :class:`~.QuantizedModule` instance converted from a | |||
:class:`~.QATModule` instance. | |||
""" | |||
return cls(qat_module.get_activation_dtype()) | |||
@@ -33,7 +33,7 @@ class QuantStub(QuantizedModule): | |||
class DequantStub(QuantizedModule): | |||
r""" | |||
quantized version of :class:`~.qat.quant_dequant.DequantStub`, | |||
Quantized version of :class:`~.qat.quant_dequant.DequantStub`, | |||
will restore quantized input to float32 dtype. | |||
""" | |||
@@ -43,7 +43,7 @@ class DequantStub(QuantizedModule): | |||
@classmethod | |||
def from_qat_module(cls, qat_module: QAT.DequantStub): | |||
r""" | |||
return a :class:`~.QuantizedModule` instance converted from a | |||
Return a :class:`~.QuantizedModule` instance converted from a | |||
:class:`~.QATModule` instance. | |||
""" | |||
return cls() |
@@ -26,40 +26,40 @@ class Sequential(Module): | |||
import megengine as mge | |||
import megengine.module as M | |||
import megengine.functional as F | |||
from collections import OrderedDict | |||
batch_size = 64 | |||
data = mge.tensor(np.zeros((batch_size, 1, 28, 28)), dtype=np.float32) | |||
label = mge.tensor(np.zeros(batch_size,), dtype=np.int32) | |||
data = data.reshape(batch_size, -1) | |||
net = M.Sequential( | |||
net0 = M.Sequential( | |||
M.Linear(28 * 28, 320), | |||
M.Linear(320, 500), | |||
M.Linear(500, 320), | |||
M.Linear(320, 10) | |||
) | |||
pred = net(data) | |||
pred0 = net0(data) | |||
loss = F.cross_entropy_with_softmax(pred0, label) | |||
modules = OrderedDict() | |||
modules["fc0"] = nn.Linear(28 * 28, 320) | |||
modules["fc1"] = nn.Linear(320, 10) | |||
net1 = nn.Sequential(modules) | |||
pred1 = net1(data) | |||
""" | |||
def __init__(self, *args): | |||
super().__init__() | |||
self.layer_keys = [] | |||
self.layer_values = [] | |||
if len(args) == 1 and isinstance(args[0], OrderedDict): | |||
for key, module in args[0].items(): | |||
# self.add_module(key, module) | |||
setattr(self, key, module) | |||
self.layer_keys.append(key) | |||
self.layer_values.append(module) | |||
else: | |||
for idx, module in enumerate(args): | |||
# self.add_module(str(idx), module) | |||
setattr(self, str(idx), module) | |||
self.layer_keys.append(str(idx)) | |||
self.layer_values.append(module) | |||
def __getitem__(self, idx): | |||
if isinstance(idx, slice): | |||
@@ -67,11 +67,10 @@ class Sequential(Module): | |||
OrderedDict(zip(self.layer_keys[idx], self.layer_values[idx])) | |||
) | |||
else: | |||
return self.layer_values[idx] | |||
return getattr(self, self.layer_keys[idx]) | |||
def __setitem__(self, idx, module): | |||
key = self.layer_keys[idx] | |||
self.layer_values[idx] = module | |||
return setattr(self, key, module) | |||
def __delitem__(self, idx): | |||
@@ -79,11 +78,9 @@ class Sequential(Module): | |||
for key in self.layer_keys[idx]: | |||
delattr(self, key) | |||
del self.layer_keys[idx] | |||
del self.layer_values[idx] | |||
else: | |||
delattr(self, self.layer_keys[idx]) | |||
del self.layer_keys[idx] | |||
del self.layer_values[idx] | |||
def __len__(self): | |||
return len(self.layer_keys) | |||
@@ -91,6 +88,10 @@ class Sequential(Module): | |||
def __iter__(self): | |||
return iter(self.layer_values) | |||
@property | |||
def layer_values(self): | |||
return [getattr(self, key) for key in self.layer_keys] | |||
def forward(self, inp): | |||
for layer in self.layer_values: | |||
inp = layer(inp) | |||
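With ``layer_values`` now derived from ``layer_keys`` via ``getattr``, indexing and slicing behave as sketched below (a small illustration, not part of the diff):

```python
import megengine.module as M

net = M.Sequential(
    M.Linear(8, 8),
    M.Linear(8, 4),
    M.Linear(4, 2),
)

head = net[:2]   # slicing returns a new Sequential with the first two layers
last = net[-1]   # integer indexing returns the layer itself
print(len(head), last)
```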
@@ -22,13 +22,13 @@ class Adadelta(Optimizer): | |||
:param params: iterable of parameters to optimize or dicts defining | |||
parameter groups. | |||
:param lr: coefficient that scale delta before it is applied | |||
to the parameters (default: 1.0). | |||
:param lr: coefficient that scales delta before it is applied | |||
to the parameters. Default: 1.0 | |||
:param rho: coefficient used for computing a running average | |||
of squared gradients (default: 0.9). | |||
of squared gradients. Default: 0.9 | |||
:param eps: term added to the denominator to improve | |||
numerical stability (default: 1e-6). | |||
:param weight_decay: weight decay (L2 penalty) (default: 0). | |||
numerical stability. Default: 1e-6 | |||
:param weight_decay: weight decay (L2 penalty). Default: 0 | |||
""" | |||
def __init__( | |||
@@ -23,12 +23,12 @@ class Adagrad(Optimizer): | |||
:param params: iterable of parameters to optimize or dicts defining | |||
parameter groups. | |||
:param lr: coefficient that scale delta before it is applied | |||
to the parameters (default: 1e-2). | |||
:param lr_decay: learning rate decay (default: 0) | |||
:param lr: coefficient that scales delta before it is applied | |||
to the parameters. Default: 1e-2 | |||
:param lr_decay: learning rate decay. Default: 0 | |||
:param eps: term added to the denominator to improve | |||
numerical stability (default: 1e-10). | |||
:param weight_decay: weight decay (L2 penalty) (default: 0). | |||
numerical stability. Default: 1e-10 | |||
:param weight_decay: weight decay (L2 penalty). Default: 0 | |||
""" | |||
def __init__( | |||