feat(imperative): add support for quantized conv transpose2d

GitOrigin-RevId: ffd6431299
3 years ago · 3eb0505f9b
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,7 +67,6 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
 option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
 option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF)


 if(MSVC OR WIN32)
    message(STATUS "windows force cudnn static link")
    set(MGE_WITH_CUDNN_SHARED OFF)
@@ -332,7 +331,6 @@ set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_
 set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")


 if(NOT MGE_WITH_JIT)
    if(MGE_WITH_HALIDE)
        message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled")
@@ -728,7 +726,6 @@ if (MGE_WITH_ROCM)
    include(cmake/rocm.cmake)
 endif ()


 if(MGE_WITH_ATLAS)
    add_subdirectory(dnn/atlas-stub)
    list(APPEND MGE_ATLAS_LIBS atlas-stub)
@@ -736,7 +733,6 @@ if(MGE_WITH_ATLAS)
    set(MGB_ATLAS ${MGE_WITH_ATLAS})
 endif()


 find_program(CCACHE_BIN ccache)
 if(CCACHE_BIN)
    set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_BIN})
@@ -834,12 +830,10 @@ endif()
 set(MGB_CUDA ${MGE_WITH_CUDA})
 set(MEGDNN_WITH_CUDA ${MGE_WITH_CUDA})


 #ROCM
 set(MGB_ROCM ${MGE_WITH_ROCM})
 set(MEGDNN_WITH_ROCM ${MGE_WITH_ROCM})


 # CAMBRICON
 set(MGB_CAMBRICON ${MGE_WITH_CAMBRICON})
 set(MEGDNN_WITH_CAMBRICON ${MGE_WITH_CAMBRICON})
@@ -1029,7 +1023,6 @@ if(MGE_BUILD_SDK)
    add_subdirectory(sdk/load-and-run)
 endif()


 if(MGE_BUILD_IMPERATIVE_RT)
    add_subdirectory(imperative)
    message(STATUS "Enable imperative python wrapper runtime")
@@ -1117,4 +1110,3 @@ if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND("${CUDNN_VERSION}" VERSION_GREATER
    message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
    message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
 endif()

--- a/cmake/Halide.cmake
+++ b/cmake/Halide.cmake
@@ -1,3 +1,4 @@

 include(ExternalProject)
 find_package(LLVM 6.0 REQUIRED CONFIG)

--- a/dnn/CMakeLists.txt
+++ b/dnn/CMakeLists.txt
@@ -38,7 +38,6 @@ list(APPEND OPR_PARAM_DEFS_OUTS
 )
 list(APPEND OPR_PARAM_DEFS_INC ${OPR_PARAM_DEFS_OUT_DIR})


 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/megdnn DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h")

 add_custom_target(_opr_param_defs DEPENDS ${OPR_PARAM_DEFS_OUTS})
@@ -56,7 +55,6 @@ endforeach()
 add_dependencies(opr_param_defs _opr_param_defs)
 install(TARGETS opr_param_defs EXPORT ${MGE_EXPORT_TARGETS})


 if(MGE_WITH_CUDA)
    add_library(cutlass INTERFACE)
    target_include_directories(cutlass 
--- a/dnn/include/megdnn/config/config.h
+++ b/dnn/include/megdnn/config/config.h
@@ -13,7 +13,6 @@

 #if !defined(__CUDACC__) && !defined(__HIPCC__)


 #endif  // !defined(__CUDACC__)

 // vim: syntax=cpp.doxygen
--- a/dnn/include/megdnn/handle.h
+++ b/dnn/include/megdnn/handle.h
@@ -90,7 +90,6 @@ class Handle {
        std::unique_ptr<opr> create_rocm_operator();
 #endif


        virtual ~Handle();

        /*!
--- a/dnn/src/CMakeLists.txt
+++ b/dnn/src/CMakeLists.txt
@@ -137,11 +137,9 @@ if(MGE_WITH_CUDA)
    gen_cutlass_kimpl(conv2d tensorop8832)
    file(GLOB_RECURSE CUTLASS_SOURCES ${CUTLASS_GEN_DIR}/*.cu)
    list(APPEND SOURCES ${CUTLASS_SOURCES})

    list(APPEND SOURCES ${CUSOURCES})
 endif()


 if(MGE_WITH_CAMBRICON)
    file(GLOB_RECURSE SOURCES_ cambricon/*.cpp)
    list(APPEND SOURCES ${SOURCES_})
@@ -161,7 +159,6 @@ if(MGE_WITH_ATLAS)
    list(APPEND LIBMEGDNN_DEF -DMEGDNN_WITH_ATLAS=1)
 endif()


 add_definitions(${LIBMEGDNN_DEF})
 add_library(megdnn EXCLUDE_FROM_ALL OBJECT ${SOURCES})

@@ -186,7 +183,6 @@ if(MGE_WITH_ROCM)
        ${AMDOCL_LIBRARY_DIR})
 endif()


 if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} STREQUAL "armv7" OR ${MGE_ARCH} STREQUAL "aarch64")
    if(MGE_ENABLE_CPUINFO)
        target_link_libraries(megdnn PRIVATE $<BUILD_INTERFACE:cpuinfo>)
--- a/dnn/src/common/flag_warn.cpp
+++ b/dnn/src/common/flag_warn.cpp
@@ -15,5 +15,4 @@
 #pragma message "Mangling is disabled."
 #endif // MEGDNN_ENABLE_MANGLING


 // vim: syntax=cpp.doxygen
--- a/dnn/src/common/handle.cpp
+++ b/dnn/src/common/handle.cpp
@@ -31,13 +31,10 @@
 #include "src/aarch64/handle.h"
 #endif



 #if MEGDNN_WITH_CUDA
 #include "src/cuda/handle.h"
 #endif


 #if MEGDNN_WITH_CAMBRICON
 #include "src/cambricon/handle.h"
 #endif
@@ -128,7 +125,6 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle,
    return nullptr;
 }


 void Handle::set_destructor(const thin_function<void()>& d) {
    megdnn_assert(!m_destructor, "destructor can be set only once");
    m_destructor = d;
--- a/dnn/src/common/megcore/common/computing_context.cpp
+++ b/dnn/src/common/megcore/common/computing_context.cpp
@@ -17,8 +17,6 @@
 #include "src/cuda/megcore/cuda_computing_context.hpp"
 #endif



 #if MEGDNN_WITH_ROCM
 #include "src/rocm/megcore/computing_context.hpp"
 #endif
--- a/dnn/src/common/warp_common.h
+++ b/dnn/src/common/warp_common.h
@@ -880,7 +880,6 @@ void remap(const Mat<T>& src, Mat<T>& dst, Mat<short>& map1, Mat<ushort>& map2,
                    for (; x1 <= bcols - 8; x1 += 8)
                        vst1q_u16(A + x1,
                                  vandq_u16(vld1q_u16(sA + x1), v_scale));

 #endif
                    for (; x1 < bcols; ++x1)
                        A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2 - 1));
--- a/dnn/src/cuda/conv_bias/algo.cpp
+++ b/dnn/src/cuda/conv_bias/algo.cpp
@@ -287,7 +287,6 @@ void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() {
    int8_nchw4_dotprod.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
 }


 ConvBiasForwardImpl::AlgoBase*
 ConvBiasForwardImpl::AlgoPack::cudnn_conv_from_enum(
        cudnnConvolutionFwdAlgo_t algo) {
--- a/dnn/src/cuda/conv_bias/algo.h
+++ b/dnn/src/cuda/conv_bias/algo.h
@@ -1037,7 +1037,6 @@ private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
 };


 class ConvBiasForwardImpl::AlgoPack : NonCopyableObj {
 private:
    AlgoBase::Mapper m_all_algos_map;
--- a/dnn/src/fallback/convolution/img2col_helper.h
+++ b/dnn/src/fallback/convolution/img2col_helper.h
@@ -10,7 +10,6 @@
 */
 #include "src/common/utils.h"


 namespace {

 template <bool is_xcorr, typename dtype>
--- a/dnn/test/CMakeLists.txt
+++ b/dnn/test/CMakeLists.txt
@@ -34,7 +34,6 @@ if(MGE_WITH_CAMBRICON)
    list(APPEND SOURCES ${SOURCES_})
 endif()


 if(MGE_WITH_ATLAS)
    file(GLOB_RECURSE SOURCES_ atlas/*.cpp)
    list(APPEND SOURCES ${SOURCES_})
@@ -45,8 +44,6 @@ if (MGE_WITH_ROCM)
    list (APPEND SOURCES ${SOURCES_})
 endif()



 add_executable(megdnn_test ${SOURCES})
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
 target_link_libraries(megdnn_test gtest)
@@ -60,7 +57,6 @@ if(MGE_WITH_ATLAS)
    target_link_libraries(megdnn_test atlas-stub)
 endif()


 target_include_directories(megdnn_test
    PRIVATE
        ${PROJECT_SOURCE_DIR}/third_party/midout/src
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -494,7 +494,6 @@ std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size,
    return args;
 }


 std::vector<TestArg> get_int8_nchw4_args_check_bounds(size_t kernel_size) {
    std::vector<TestArg> args;
    param::ConvBias cur_param;
@@ -530,7 +529,6 @@ std::vector<TestArg> get_int8_nchw4_args_check_bounds(size_t kernel_size) {
    return args;
 }


 std::vector<TestArg> get_int8_nchw4_args_small_batch(size_t kernel_size) {
    std::vector<TestArg> args;
    param::ConvBias cur_param;
@@ -974,7 +972,6 @@ void benchmark_winograd(const char* algo_name, Handle* handle, size_t kernel,
 }
 #endif  // MEGDNN_WITH_BENCHMARK


 std::vector<conv_bias::TestArg> get_conv_bias_args(
        std::vector<size_t> kernel, size_t stride, bool no_pad, bool no_bias,
        bool no_nonlinemode, bool quantized_nlmod, bool only_broadcast_bias) {
@@ -1188,7 +1185,6 @@ void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args,
    }
 }


 void checker_conv_bias_common(std::vector<conv_bias::TestArg> args, Handle* handle,
                       RNG* rng, float epsilon, DType type0, DType type1,
                       DType type2, DType type3, const char* algo_name) {
--- a/dnn/test/common/conv_bias.h
+++ b/dnn/test/common/conv_bias.h
@@ -93,7 +93,6 @@ void check_conv_bias(std::vector<megdnn::test::conv_bias::TestArg> args,
 void checker_conv_bias_int8x8x16(
        std::vector<megdnn::test::conv_bias::TestArg> args,
        megdnn::Handle* handle, const char* algo_name);

 void checker_conv_bias_common(std::vector<conv_bias::TestArg> args,
                              Handle* handle, RNG* rng, float epsilon,
                              DType type0, DType type1, DType type2,
--- a/dnn/test/common/small_vector.cpp
+++ b/dnn/test/common/small_vector.cpp
@@ -1145,7 +1145,6 @@ TEST(SmallVectorTest, SwapMoveOnly) {
        }
    }
 }

 }  // anonymous namespace

 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/dnn/test/common/test_basic_types.cpp
+++ b/dnn/test/common/test_basic_types.cpp
@@ -40,7 +40,6 @@ TensorLayout make_layout(std::initializer_list<size_t> shape,
 }

 } // anonymous namespace

 #if MEGDNN_64_BIT
 TEST(BASIC_TYPES, TOTAL_NR_ELEMS) {
    TensorShape shp{1u<<31, 1u<<31};
@@ -340,5 +339,4 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
                              LowbitsAlignedToBytesTensorFormat::make(4_z)),
                 MegDNNError);
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/test/cuda/conv_bias_int8.cpp
+++ b/dnn/test/cuda/conv_bias_int8.cpp
@@ -697,7 +697,6 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
            conv_bias::get_int8_chwn4_args_small_batch(1));
 }


 TEST_F(CUDA, FALLBACK_CONV_QS8) {
    require_compute_capability_eq(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
@@ -1100,7 +1099,6 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
 }


 #if CUDA_VERSION >= 10020
 TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
    require_compute_capability(7, 5);
--- a/dnn/test/cuda/elemwise_multi_type.cpp
+++ b/dnn/test/cuda/elemwise_multi_type.cpp
@@ -32,7 +32,6 @@ TYPED_TEST(CUDA_ELEMWISE_MULTI_TYPE, run) {
    elemwise_multi_type::run_test<TypeParam>(this->handle_cuda());
 }


 using Mode = ElemwiseMultiType::Param::Mode;
 static void run_test(int arity, Checker<ElemwiseMultiType>& checker, Mode mode) {
    for (auto type : std::vector<std::pair<DType, DType>>{
--- a/dnn/test/cuda/sleep.cpp
+++ b/dnn/test/cuda/sleep.cpp
@@ -22,7 +22,6 @@
 using namespace megdnn;
 using namespace test;


 TEST_F(CUDA, SLEEP) {
    auto opr = this->handle_cuda()->create_operator<megdnn::SleepForward>();

@@ -53,6 +52,5 @@ TEST_F(CUDA, SLEEP) {
 }



 // vim: syntax=cpp.doxygen

--- a/dnn/test/fallback/conv_bias.cpp
+++ b/dnn/test/fallback/conv_bias.cpp
@@ -75,7 +75,6 @@ TEST_F(FALLBACK, CONV_BIAS_FORWARD) {
                .execs({src_shape, filter_shape, bias_shape, {}, {}})
                .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
    }

 }

 std::vector<conv_bias::TestArg> get_conv_bias_args(
@@ -236,7 +235,6 @@ TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_QUANTIZED) {
                      "FALLBACK_NAIVE");
 }


 #if MEGDNN_WITH_BENCHMARK
 TEST_F(FALLBACK, BENCHMARK_CONVBIAS) {
    constexpr size_t RUNS = 10;
--- a/imperative/python/megengine/functional/quantized.py
+++ b/imperative/python/megengine/functional/quantized.py
@@ -139,3 +139,52 @@ def batch_conv_bias_activation(
    )
    (outputs,) = apply(op, inp, weight, bias)
    return outputs


 def conv_transpose2d(
     inp: Tensor,
     weight: Tensor,
     bias: Tensor = None,
     dtype=None,
     stride: Union[int, Tuple[int, int]] = 1,
     padding: Union[int, Tuple[int, int]] = 0,
     dilation: Union[int, Tuple[int, int]] = 1,
     groups: int = 1,
     conv_mode="cross_correlation",
     compute_mode="default",
 ) -> Tensor:
     r"""
     Quantized 2D transposed convolution.

     Built on the ``ConvolutionBackwardData`` op, so neither a bias nor grouped
     convolution is available yet.

     :param inp: input tensor of the transposed convolution.
     :param weight: convolution kernel.
     :param bias: must be ``None``; any other value raises
         ``NotImplementedError``.
     :param dtype: forwarded to the op as its ``dtype`` argument (the data type
         requested for the output).
     :param stride: stride of the 2D convolution operation. Default: 1
     :param padding: padding applied by the convolution. Default: 0
     :param dilation: dilation of the 2D convolution operation. Default: 1
     :param groups: must be 1; any other value raises ``NotImplementedError``.
     :param conv_mode: only "cross_correlation" is accepted, either as a
         (case-insensitive) string or as an object whose ``name`` is
         ``"CROSS_CORRELATION"``.
     :param compute_mode: only "default" is accepted, either as a
         (case-insensitive) string or as an object whose ``name`` is
         ``"DEFAULT"``.
     :return: output tensor.
     """

     # A mode may arrive as a plain string or as an enum-like object exposing
     # ``name``. Probe only the attribute that matches the actual type, so an
     # unsupported value fails the assertion instead of raising AttributeError
     # (``str`` has no ``name``; an enum member has no ``lower``).
     if isinstance(conv_mode, str):
         assert conv_mode.lower() == "cross_correlation"
     else:
         assert conv_mode.name == "CROSS_CORRELATION"
     if isinstance(compute_mode, str):
         assert compute_mode.lower() == "default"
     else:
         assert compute_mode.name == "DEFAULT"

     if groups != 1:
         raise NotImplementedError(
             "group quantized transposed conv2d is not supported yet."
         )
     if bias is not None:
         raise NotImplementedError(
             "bias of quantized transposed conv2d is not supported yet."
         )

     pad_h, pad_w = _pair(padding)
     stride_h, stride_w = _pair_nonzero(stride)
     dilate_h, dilate_w = _pair_nonzero(dilation)

     # should be replaced by Op with bias such as ConvolutionBackwardDataBias
     op = builtin.ConvolutionBackwardData(
         stride_h=stride_h,
         stride_w=stride_w,
         pad_h=pad_h,
         pad_w=pad_w,
         dilate_h=dilate_h,
         dilate_w=dilate_w,
         strategy=get_execution_strategy(),
         dtype=dtype,
         compute_mode=compute_mode,
         mode=conv_mode,
     )
     # Operand order matters: the weight is passed before the input.
     (output,) = apply(op, weight, inp)
     return output
--- a/imperative/python/megengine/module/conv.py
+++ b/imperative/python/megengine/module/conv.py
@@ -651,11 +651,11 @@ class ConvTranspose2d(_ConvNd):
        # Assume format is NCHW
        return (1, self.out_channels, 1, 1)

    def forward(self, inp):
    def calc_conv_transpose2d(self, inp, weight, bias):
        return conv_transpose2d(
            inp,
            self.weight,
            self.bias,
            weight,
            bias,
            self.stride,
            self.padding,
            self.dilation,
@@ -664,6 +664,9 @@ class ConvTranspose2d(_ConvNd):
            self.compute_mode,
        )

    def forward(self, inp):
        return self.calc_conv_transpose2d(inp, self.weight, self.bias)


 class LocalConv2d(Conv2d):
    r"""
--- a/imperative/python/megengine/module/qat/init.py
+++ b/imperative/python/megengine/module/qat/init.py
@@ -7,7 +7,7 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 from .batch_matmul_activation import BatchMatMulActivation
 from .concat import Concat
 from .conv import Conv2d, ConvRelu2d
 from .conv import Conv2d, ConvRelu2d, ConvTranspose2d
 from .conv_bn import ConvBn2d, ConvBnRelu2d
 from .elemwise import Elemwise
 from .linear import Linear
--- a/imperative/python/megengine/module/qat/conv.py
+++ b/imperative/python/megengine/module/qat/conv.py
@@ -57,3 +57,42 @@ class ConvRelu2d(Conv2d):

    def forward(self, inp):
        return self.apply_quant_activation(F.relu(self.calc_conv_qat(inp)))


 class ConvTranspose2d(Float.ConvTranspose2d, QATModule):
     r"""
     :class:`~.module.ConvTranspose2d` with QAT support, implemented as a
     :class:`~.QATModule`.
     Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
     """

     def calc_conv_transpose2d_qat(self, inp):
         r"""
         Run the transposed convolution using the weight and bias returned by
         ``apply_quant_weight`` and ``apply_quant_bias``.
         """
         quant_weight = self.apply_quant_weight(self.weight)
         quant_bias = self.apply_quant_bias(self.bias, inp, quant_weight)
         return self.calc_conv_transpose2d(inp, quant_weight, quant_bias)

     @classmethod
     def from_float_module(cls, float_module: Float.ConvTranspose2d):
         r"""
         Build a :class:`~.QATModule` instance from a float
         :class:`~.Module` instance.

         The new module is constructed with the float module's hyper-parameters
         and name, and is assigned the very same weight and bias objects.
         """
         has_bias = float_module.bias is not None
         converted = cls(
             float_module.in_channels,
             float_module.out_channels,
             float_module.kernel_size,
             float_module.stride,
             float_module.padding,
             float_module.dilation,
             float_module.groups,
             has_bias,
             float_module.conv_mode,
             float_module.compute_mode,
             name=float_module.name,
         )
         # Assign the float module's parameters directly (no copy is made here).
         converted.weight = float_module.weight
         converted.bias = float_module.bias
         return converted

     def forward(self, inp):
         conv_out = self.calc_conv_transpose2d_qat(inp)
         return self.apply_quant_activation(conv_out)
--- a/imperative/python/megengine/module/quantized/init.py
+++ b/imperative/python/megengine/module/quantized/init.py
@@ -7,7 +7,7 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 from .batch_matmul_activation import BatchMatMulActivation
 from .concat import Concat
 from .conv import Conv2d, ConvRelu2d
 from .conv import Conv2d, ConvRelu2d, ConvTranspose2d
 from .conv_bn import ConvBn2d, ConvBnRelu2d
 from .elemwise import Elemwise
 from .linear import Linear
--- a/imperative/python/megengine/module/quantized/conv.py
+++ b/imperative/python/megengine/module/quantized/conv.py
@@ -12,6 +12,7 @@ import numpy as np
 from ... import module as Float
 from ...core.tensor import dtype
 from ...functional.nn import conv_bias_activation
 from ...functional.quantized import conv_transpose2d
 from ...tensor import Parameter
 from ..qat import conv as QAT
 from .module import QuantizedModule
@@ -108,3 +109,98 @@ class ConvRelu2d(Conv2d):

    def forward(self, inp):
        return self.calc_conv_quantized(inp, nonlinear_mode="relu")


 class ConvTranspose2d(Float.ConvTranspose2d, QuantizedModule):
     r"""Quantized version of :class:`~.qat.ConvTranspose2d`.

     Applies a 2D transposed convolution over a quantized input tensor, used
     for inference only.

     The parameter is same with :class:`~.module.ConvTranspose2d` but dtype.

     :param dtype: data type of the output, should be qint8.
     """

     def __init__(
         self,
         in_channels: int,
         out_channels: int,
         kernel_size: Union[int, Tuple[int, int]],
         stride: Union[int, Tuple[int, int]] = 1,
         padding: Union[int, Tuple[int, int]] = 0,
         dilation: Union[int, Tuple[int, int]] = 1,
         groups: int = 1,
         bias: bool = True,
         conv_mode: str = "cross_correlation",
         compute_mode: str = "default",
         dtype=None,
         **kwargs
     ):
         # Pass extra keyword arguments on to the base class instead of silently
         # discarding them; ``from_qat_module`` relies on this to carry over the
         # module ``name``.
         super().__init__(
             in_channels=in_channels,
             out_channels=out_channels,
             kernel_size=kernel_size,
             stride=stride,
             padding=padding,
             dilation=dilation,
             groups=groups,
             bias=bias,
             conv_mode=conv_mode,
             compute_mode=compute_mode,
             **kwargs
         )
         # Requested output data type, handed to the functional op in forward.
         self.output_dtype = dtype

     @classmethod
     def from_qat_module(cls, qat_module: QAT.ConvTranspose2d):
         r"""
         return a :class:`~.QuantizedModule` instance converted from a
         :class:`~.QATModule` instance.

         The output dtype is taken from ``qat_module.get_activation_dtype()``,
         the weight is cast to ``qat_module.get_weight_dtype()``, and the bias
         (if any) is copied as-is; it is only cast to qint32 at call time.
         """
         output_dtype = qat_module.get_activation_dtype()
         qconv = cls(
             qat_module.in_channels,
             qat_module.out_channels,
             qat_module.kernel_size,
             qat_module.stride,
             qat_module.padding,
             qat_module.dilation,
             qat_module.groups,
             qat_module.bias is not None,
             qat_module.conv_mode,
             qat_module.compute_mode,
             dtype=output_dtype,
             name=qat_module.name,
         )
         weight = qat_module.weight.astype(qat_module.get_weight_dtype())
         qconv.weight = Parameter(weight.numpy(), name=qat_module.weight.name)
         qconv.bias = (
             Parameter(qat_module.bias.numpy(), name=qat_module.bias.name)
             if qat_module.bias is not None
             else None
         )
         return qconv

     def calc_conv_transpose2d_quantized(self, inp):
         r"""
         Apply the quantized functional ``conv_transpose2d`` to ``inp`` with
         this module's weight, bias and hyper-parameters.

         NOTE(review): the functional ``conv_transpose2d`` imported by this
         file currently raises ``NotImplementedError`` for a non-``None`` bias
         or ``groups != 1``, so such configurations fail at call time.
         """
         bias = None
         if self.bias is not None:
             # Cast the bias to qint32 with scale = input scale * weight scale.
             inp_scale = dtype.get_scale(inp.dtype)
             w_scale = dtype.get_scale(self.weight.dtype)
             bias = self.bias.astype(dtype.qint32(inp_scale * w_scale))

         return conv_transpose2d(
             inp=inp,
             weight=self.weight,
             bias=bias,
             dtype=self.output_dtype,
             stride=self.stride,
             padding=self.padding,
             dilation=self.dilation,
             groups=self.groups,
             conv_mode=self.conv_mode,
             compute_mode=self.compute_mode,
         )

     def forward(self, inp):
         return self.calc_conv_transpose2d_quantized(inp)
--- a/imperative/python/megengine/quantization/internal_fake_quant.py
+++ b/imperative/python/megengine/quantization/internal_fake_quant.py
@@ -13,5 +13,3 @@ from .fake_quant import _FakeQuantize
 from .observer import MinMaxObserver
 from .qconfig import QConfig
 from .utils import QParams


--- a/imperative/python/megengine/utils/persistent_cache.py
+++ b/imperative/python/megengine/utils/persistent_cache.py
@@ -69,7 +69,6 @@ class PersistentCacheOnServer(_PersistentCache):
    def make_user_prefix(cls):
        return "mgbcache:{}".format(getpass.getuser())


    def _make_key(self, category, key):
        prefix_with_version = "{}:MGB{}".format(self._prefix, __version__)
        return b"@".join(
@@ -86,5 +85,3 @@ class PersistentCacheOnServer(_PersistentCache):
        key = self._make_key(category, key)
        self._prev_get_refkeep = conn.get(key)
        return self._prev_get_refkeep


--- a/imperative/python/setup.py
+++ b/imperative/python/setup.py
@@ -38,7 +38,6 @@ class build_ext(_build_ext):
            modpath = str(pathlib.Path(*modpath).resolve())

            copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run)

 package_name = 'MegEngine'

 v = {}
@@ -79,7 +78,6 @@ megengine_data += [
    for f in pathlib.Path('megengine', 'core', 'lib').glob('**/*')
 ]


 with open('requires.txt') as f:
    requires = f.read().splitlines()
 with open('requires-style.txt') as f:
@@ -108,8 +106,6 @@ setup_kwargs = dict(
    cmdclass={'build_ext': build_ext},
    scripts = ['./megengine/tools/mge'],
 )


 setup_kwargs.update(dict(
    classifiers=[
    'Development Status :: 3 - Alpha',
--- a/imperative/python/test/unit/functional/test_functional.py
+++ b/imperative/python/test/unit/functional/test_functional.py
@@ -876,8 +876,6 @@ def test_nms_is_same():
    assert op3 != op4




 def test_argmxx_on_inf():
    def run_argmax():
        x = F.zeros((100, 100))
--- a/imperative/python/test/unit/module/test_qat.py
+++ b/imperative/python/test/unit/module/test_qat.py
@@ -13,6 +13,7 @@ from megengine.module import (
    Conv2d,
    ConvBn2d,
    ConvRelu2d,
    ConvTranspose2d,
    DequantStub,
    Module,
    QuantStub,
@@ -202,3 +203,40 @@ def test_quantize_batchmatmul_activation():
        infer_cg = cgtools.GraphInference(file)[0]
        dumped_outputs = list(infer_cg.run(inputs.numpy()).values())[0]
        np.testing.assert_allclose(quantize_outputs.numpy(), dumped_outputs, atol=1e-6)


 def test_qat_conv_transpose2d():
     """
     With fake quantization disabled, the QAT-converted network must produce
     the same outputs as the float network, in both train and eval mode and
     with or without a bias.
     """
     in_channels = 32
     out_channels = 64
     kernel_size = 3

     class TestNet(Module):
         # QuantStub -> ConvTranspose2d -> DequantStub
         def __init__(self, bias):
             super().__init__()
             self.quant = QuantStub()
             self.dequant = DequantStub()
             self.conv = ConvTranspose2d(
                 in_channels, out_channels, kernel_size, bias=bias
             )

         def forward(self, inp):
             return self.dequant(self.conv(self.quant(inp)))

     data = np.random.randn(4, in_channels, 32, 32).astype(np.float32)
     inputs = tensor(data)
     for with_bias in (True, False):
         float_net = TestNet(with_bias)
         float_net.train()
         qat_net = quantize_qat(float_net, inplace=False)
         disable_fake_quant(qat_net)

         # train mode
         float_out = float_net(inputs)
         qat_out = qat_net(inputs)
         np.testing.assert_allclose(float_out.numpy(), qat_out.numpy())

         # eval mode
         float_net.eval()
         float_out = float_net(inputs)
         qat_net.eval()
         qat_out = qat_net(inputs)
         np.testing.assert_allclose(float_out.numpy(), qat_out.numpy())
--- a/imperative/python/test/unit/quantization/test_fake_quant.py
+++ b/imperative/python/test/unit/quantization/test_fake_quant.py
@@ -92,8 +92,6 @@ def test_tqt():
    np.testing.assert_allclose(g_s.numpy(), g_s_np, rtol=5e-5, atol=5e-5)




 def _save_to(self, name="grad"):
    def callback(grad):
        setattr(self, name, grad)
--- a/imperative/python/test/unit/quantization/test_op.py
+++ b/imperative/python/test/unit/quantization/test_op.py
@@ -14,6 +14,7 @@ import megengine.functional as F
 from megengine.core.tensor import dtype
 from megengine.device import get_device_count
 from megengine.functional.elemwise import _elemwise_multi_type, _elwise
 from megengine.module.quantized.conv import ConvTranspose2d
 from megengine.quantization import QuantMode, create_qparams


@@ -168,3 +169,94 @@ def test_conv_bias():

    run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, False, "relu")
    run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, True, "relu")


 def test_conv_transpose2d():
     """
     Compare the quantized ``ConvTranspose2d`` module against the float
     ``F.conv_transpose2d`` reference, after quantizing the reference output
     to the same qint8 output dtype.
     """
     rng = np.random.RandomState(seed=2021)

     def test_func(
         N,
         IC,
         IH,
         IW,
         OC,
         KH,
         KW,
         SH,
         SW,
         PH,
         PW,
         DH,
         DW,
         groups=1,
         has_bias=True,
         conv_mode: str = "cross_correlation",
         compute_mode: str = "default",
     ):
         def draw_scale():
             return np.float32(rng.uniform(low=0.04, high=0.06))

         def draw_fp32(shape):
             return rng.uniform(low=-1, high=1, size=shape).astype(np.float32)

         # Draw order is significant: it fixes the random stream consumed.
         inp_scale = draw_scale()
         weight_scale = draw_scale()
         bias_scale = inp_scale * weight_scale
         out_scale = draw_scale()

         inp_dtype = dtype.qint8(inp_scale)
         weight_dtype = dtype.qint8(weight_scale)
         bias_dtype = dtype.qint32(bias_scale)
         out_dtype = dtype.qint8(out_scale)

         raw_inp = draw_fp32((N, IC, IH, IW))
         raw_weight = draw_fp32((IC, OC, KH, KW))
         raw_bias = draw_fp32((1, OC, 1, 1))

         # Quantize the random data and wrap it into tensors / parameters.
         inp_int8 = mge.tensor(
             dtype.convert_to_qint8(raw_inp, inp_dtype), dtype=inp_dtype
         )
         weight_int8 = mge.Parameter(
             dtype.convert_to_qint8(raw_weight, weight_dtype), dtype=weight_dtype
         )
         bias_int32 = mge.Parameter(
             dtype.convert_to_qint32(raw_bias, bias_dtype), dtype=bias_dtype
         )

         # Float views of the quantized values feed the reference computation.
         inp_fp32 = inp_int8.astype("float32")
         weight_fp32 = weight_int8.astype("float32")
         bias_fp32 = bias_int32.astype("float32")

         reference = F.conv_transpose2d(
             inp_fp32,
             weight_fp32,
             bias_fp32 if has_bias else None,
             stride=(SH, SW),
             padding=(PH, PW),
             dilation=(DH, DW),
             groups=groups,
             conv_mode=conv_mode,
             compute_mode=compute_mode,
         )
         expected = dtype.convert_from_qint8(
             dtype.convert_to_qint8(reference.numpy(), out_dtype)
         )

         qmodule = ConvTranspose2d(
             in_channels=IC,
             out_channels=OC,
             kernel_size=(KH, KW),
             stride=(SH, SW),
             padding=(PH, PW),
             dilation=(DH, DW),
             groups=groups,
             bias=has_bias,
             conv_mode=conv_mode,
             compute_mode=compute_mode,
             dtype=out_dtype,
         )

         qmodule.weight = mge.Parameter(weight_int8)
         if has_bias:
             qmodule.bias = mge.Parameter(bias_int32)
         result = dtype.convert_from_qint8(qmodule.forward(inp_int8).numpy())
         np.testing.assert_allclose(result, expected, atol=out_scale)

     # (N, IC, IH, IW, OC, KH, KW, SH, SW, PH, PW, DH, DW, groups, has_bias)
     cases = [
         (1, 4, 1, 1, 4, 1, 1, 1, 1, 0, 0, 1, 1, 1, False),
         (2, 4, 3, 1, 8, 1, 1, 1, 1, 0, 0, 1, 1, 1, False),
         (4, 4, 16, 16, 8, 3, 3, 1, 1, 1, 1, 1, 1, 1, False),
         (32, 64, 36, 28, 16, 3, 2, 1, 3, 1, 0, 1, 1, 1, False),
     ]
     for case in cases:
         test_func(*case)
--- a/imperative/python/test/unit/utils/test_network_node.py
+++ b/imperative/python/test/unit/utils/test_network_node.py
@@ -486,8 +486,6 @@ def test_topk():
    check_pygraph_dump(fwd, [x], [top, indices])




 def test_random():
    @trace(symbolic=True, capture_as_const=True)
    def fwd():
@@ -723,8 +721,6 @@ def test_elemwise_multitype():
    check_pygraph_dump(fwd, [x, y], [result])




 def test_cvtcolor():
    inp = np.random.randn(3, 3, 3, 3).astype(np.float32)
    x = Tensor(inp)
--- a/imperative/python/version_template.py
+++ b/imperative/python/version_template.py
@@ -7,4 +7,3 @@
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 __version__ = "1.6.0.dev"

--- a/imperative/src/impl/ops/convolution.cpp
+++ b/imperative/src/impl/ops/convolution.cpp
@@ -43,6 +43,11 @@ auto apply_on_var_node(
        const VarNodeArray& inputs) {
    auto&& conv = static_cast<const ConvolutionBackwardData&>(def);
    OperatorNodeConfig config{conv.make_name()};
    DType output_dtype = conv.dtype;
    if (output_dtype.valid()) {
        config.output_dtype(output_dtype);
    }

    if (inputs.size() == 2) {
        return opr::ConvolutionBackwardData::make(inputs[0], inputs[1], conv.param(), conv.policy(), config);
    } else {
--- a/scripts/whl/macos/macos_build_whl.sh
+++ b/scripts/whl/macos/macos_build_whl.sh
@@ -192,7 +192,6 @@ function do_build() {
        #handle dlopen path
        install_name_tool -change @rpath/libmegengine_export.dylib @loader_path/lib/libmegengine_export.dylib _imperative_rt.so


        #copy megbrain_export lib
        DEPEND_LIB=${BUILD_DIR}/staging/megengine/core/lib/
        rm -rf ${DEPEND_LIB}
@@ -209,7 +208,6 @@ function do_build() {
        echo "comapt whl name: ${compat_whl_name}"
        cp ${BUILD_DIR}/staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name}


        cd ${SRC_DIR}
        echo ""
        echo "##############################################################################################"
@@ -220,12 +218,10 @@ function do_build() {
    done
 }


 function third_party_prepare() {
    echo "init third_party..."
    ${SRC_DIR}/third_party/prepare.sh


    if [[ -z ${ALREADY_INSTALL_MKL} ]]
    then
        echo "init third_party..."
--- a/scripts/whl/manylinux2014/do_build_common.sh
+++ b/scripts/whl/manylinux2014/do_build_common.sh
@@ -55,13 +55,11 @@ function patch_elf_depend_lib_mgb_mge() {
    patchelf --force-rpath --set-rpath '$ORIGIN/.' ${LIBS_DIR}/libmegengine_export.so
    handle_strip ${LIBS_DIR}/libmegengine_export.so


    # as some version of cudnn/trt libs have dlopen libs, so we can not use auditwheel
    # TODO: PR for auditwheel to support args for dlopen libs
    handle_copy_cuda_libs ${LIBS_DIR}
 }


 SRC_DIR=$(readlink -f "`dirname $0`/../../../")
 source ${SRC_DIR}/scripts/whl/utils/utils.sh

@@ -142,7 +140,6 @@ do
    mkdir -p staging
    cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/


    cd ${BUILD_DIR}/staging/megengine/core
    mkdir -p lib/ucx
    patch_elf_depend_lib_mgb_mge
@@ -158,7 +155,6 @@ do
    echo "comapt whl name: ${compat_whl_name}"
    mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name}


    cd /home/output
    chown -R ${UID}.${UID} .
    # compat for root-less docker env to remove output at host side
--- a/scripts/whl/windows/windows_build_whl.sh
+++ b/scripts/whl/windows/windows_build_whl.sh
@@ -70,7 +70,6 @@ then
    BUILD_WHL_CPU_ONLY="OFF"
 fi


 # config NVIDIA libs
 TRT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/TensorRT-6.0.1.5/lib/nvinfer.dll"
 CUDNN_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/cudnn-10.1-windows10-x64-v7.6.5.32/cuda/bin/cudnn64_7.dll"
@@ -102,14 +101,11 @@ function copy_more_dll() {
    # empty.file to trigger setup.py to create a null empty
    echo "empty" > ${CP_WHL_DST_IMP}/empty.file


    if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
        echo "copy nvidia lib to whl use...."
        depend_real_copy ${CP_WHL_DST_IMP}

    fi
 }

 BUILD_DIR=${SRC_DIR}/build_dir/host/build/

 # here we assume that cu files are not in the incremental build file list
@@ -194,14 +190,12 @@ function do_build() {
        llvm-strip -s ${rt_file}
        mv ${rt_file} _imperative_rt.pyd


        copy_more_dll
        cd ${BUILD_DIR}/staging
        echo "call setup.py now"
        ${PYTHON_DIR}/python3 setup.py bdist_wheel
        cp ${BUILD_DIR}/staging/dist/Meg*.whl ${WINDOWS_WHL_HOME}/


        echo ""
        echo "##############################################################################################"
        echo "windows whl package location: ${WINDOWS_WHL_HOME}"
@@ -215,7 +209,6 @@ function third_party_prepare() {
    echo "init third_party..."
    ${SRC_DIR}/third_party/prepare.sh


    if [[ -z ${ALREADY_INSTALL_MKL} ]]
    then
        echo "init third_party..."
--- a/src/core/impl/comp_node_env.cpp
+++ b/src/core/impl/comp_node_env.cpp
@@ -35,8 +35,6 @@
 #include "megcore_atlas.h"
 #endif



 using namespace mgb;

 /* =================== MegDNNHandle =================== */
@@ -102,7 +100,6 @@ MegDNNHandle::MegDNNHandle(const CompNodeEnv& env) {
    }
 #endif


    if (env.property().type == CompNode::DeviceType::CPU) {
        megcoreCreateDeviceHandle(&m_dev_hdl, megcorePlatformCPU);
        megcoreCreateComputingHandleWithCPUDispatcher(&m_comp_hdl, m_dev_hdl,
@@ -234,7 +231,6 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
 }
 #endif


 #if MGB_ATLAS

 void mgb::_on_atlas_error(const char* expr, int err, const char* file,
@@ -258,8 +254,6 @@ void CompNodeEnv::init_atlas(CompNode comp_node, const AtlasEnv& env) {
 }
 #endif



 #if MGB_ROCM

 void mgb::_on_hip_error(const char* expr, hipError_t err, const char* file,
@@ -381,7 +375,6 @@ void CompNodeEnv::init_cpu(const CpuEnv& env, CompNode comp_node) {
            MegDNNHandle::get(*this).handle()->alignment_requirement();
 }


 #if MGB_CAMBRICON
 void CompNodeEnv::init_cnrt(int dev, CompNode comp_node,
                            const ContinuationCtx<cnrtQueue_t>& cont) {
@@ -446,7 +439,6 @@ void CompNodeEnv::fini() {
        MGB_ATLAS_CHECK(aclrtDestroyStream(m_atlas_env.stream));
    }
 #endif

 }

 #if MGB_ENABLE_COMP_NODE_ASYNC_INIT
--- a/src/core/impl/exception.cpp
+++ b/src/core/impl/exception.cpp
@@ -73,14 +73,11 @@ std::string CudaError::get_cuda_extra_info() {
 #endif
 }


 // Construct an AtlasError from a message string; the message is passed
 // unchanged to the SystemError base-class constructor and the body does
 // nothing else.
 AtlasError::AtlasError(const std::string &msg):
    SystemError(msg)
 {
 }



 ROCmError::ROCmError(const std::string &msg):
    SystemError(msg)
 {
--- a/src/core/impl/graph/cg_impl.cpp
+++ b/src/core/impl/graph/cg_impl.cpp
@@ -23,7 +23,6 @@
 #include "megbrain/graph/helper.h"
 #include "megbrain/opr/utility.h"


 #if MGB_ENABLE_TENSOR_RT
 #include "megbrain/tensorrt/opr_replace.h"
 #endif
@@ -554,7 +553,6 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
    }
 #endif


 #if MGB_JIT
   if (std::abs(options().graph_opt_level) == 0 &&
        (options().graph_opt.jit || options().graph_opt.jit_config.enabled())) {
--- a/src/core/impl/graph/var_node_mem_mgr.h
+++ b/src/core/impl/graph/var_node_mem_mgr.h
@@ -445,7 +445,6 @@ class VarNodeMemManager {

        SyncableCounter m_cpu_async_release_barrier;


 #if MGB_CUDA || MGB_ATLAS || MGB_CAMBRICON  || MGB_ROCM
        //! release dynamic var after compnode event finishes
        class AsyncVarReleaser;
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -508,7 +508,6 @@ class CompNode {
         */
        static bool enable_affinity_for_cpu(bool flag);


    protected:
        //! ImplBase with env(); defined in CompNodeEnv
        class Impl;
--- a/src/core/include/megbrain/comp_node_env.h
+++ b/src/core/include/megbrain/comp_node_env.h
@@ -19,8 +19,6 @@

 #include "megdnn/handle.h"



 #if MGB_CUDA
 #include <cuda_runtime.h>
 #include <cuda.h>
@@ -90,8 +88,6 @@

 #endif // MGB_ATLAS



 #if MGB_ROCM
 #include "hcc_detail/hcc_defs_prologue.h"
 #include "megcore_rocm.h"
@@ -196,7 +192,6 @@ namespace mgb {
                                  const char* file, const char* func, int line);
 #endif


 #if MGB_CUDA
 [[noreturn]] void _on_cuda_error(const char* expr, cudaError_t err,
                                 const char* file, const char* func, int line);
@@ -205,7 +200,6 @@ namespace mgb {
                                    int line);
 #endif


 #if MGB_ROCM
 [[noreturn]] void _on_hip_error(const char* expr, hipError_t err,
                                const char* file, const char* func, int line);
@@ -232,7 +226,6 @@ public:
        mgb_assert(0, "The CompNode set_affinity is not implement");
    }
 };

 using AtlasDispatcher = CPUDispatcher;

 /*!
@@ -328,7 +321,6 @@ public:
        }
 #endif


    }

    /*!
@@ -370,7 +362,6 @@ public:
                         const ContinuationCtx<cudaStream_t>& cont);
 #endif


 #if MGB_ATLAS
    struct AtlasEnv {
        int device = -1;
@@ -431,8 +422,6 @@ public:
    void init_atlas(CompNode comp_node, const AtlasEnv& env);
 #endif



 #if MGB_ROCM
    struct ROCmEnv {
        int device = -1;
@@ -547,7 +536,6 @@ private:
    CompNode m_comp_node;
    Property m_property;
    MemEventHandler m_mem_event_handler;

 #if MGB_CUDA
    CudaEnv m_cuda_env;
 #endif
--- a/src/core/include/megbrain/exception.h
+++ b/src/core/include/megbrain/exception.h
@@ -71,7 +71,6 @@
    })                                                                        \
    do {                                                                      \
    } while (0)

 namespace mgb {

 //! the most general MegBrain exception type; also base class for all megbrain
@@ -149,7 +148,6 @@ public:
    AtlasError(const std::string& msg);
 };


 class ROCmError final : public SystemError {
 public:
    /*!
@@ -224,7 +222,6 @@ public:
    using MegBrainError::MegBrainError;
 };


 }  // namespace mgb

 namespace mgb {
@@ -233,5 +230,4 @@ bool has_uncaught_exception();

 }  // namespace mgb


 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/core/include/megbrain/ir/ops.td
+++ b/src/core/include/megbrain/ir/ops.td
@@ -49,7 +49,11 @@ def SVD: MgbHashableOp<"SVD", [SVDParam]>;

 def Convolution : MgbHashableOp<"Convolution", [ConvolutionParam, ExecutionPolicyParamBase<"policy">]>;

 def ConvolutionBackwardData: MgbHashableOp<"ConvolutionBackwardData", [ConvolutionParam, ExecutionPolicyParamBase<"policy">]>;
 // ConvolutionBackwardData op: hashable op built from ConvolutionParam and an
 // execution-policy param named "policy". In addition to those params it
 // declares one extra argument, `dtype`, of attribute type MgbDTypeAttr.
 // NOTE(review): presumably `dtype` carries the output data type needed for
 // quantized conv transpose — confirm against the imperative op implementation.
 def ConvolutionBackwardData: MgbHashableOp<"ConvolutionBackwardData", [ConvolutionParam, ExecutionPolicyParamBase<"policy">]> {
  let extraArguments = (ins
    MgbDTypeAttr:$dtype
  );
 }

 def Convolution3D: MgbHashableOp<"Convolution3D", [Convolution3DParam, ExecutionPolicyParamBase<"policy">]>;

--- a/src/core/test/comp_node.cpp
+++ b/src/core/test/comp_node.cpp
@@ -40,7 +40,6 @@ TEST(TestCompNode, Parse) {
    ASSERT_EQ(L::parse("cpu2:23"), make_lc(D::CPU, 2, 23));
    ASSERT_EQ(L::parse("cpu21:23"), make_lc(D::CPU, 21, 23));


    ASSERT_EQ(L::parse("rocmx"), make_lc(D::ROCM, -1, 0));
    ASSERT_EQ(L::parse("rocm2"), make_lc(D::ROCM, 2, 0));
    ASSERT_EQ(L::parse("rocm2:3"), make_lc(D::ROCM, 2, 3));
@@ -62,7 +61,6 @@ TEST(TestCompNode, Parse) {
    ASSERT_EQ(L::parse("multithread:default:2"),
              make_lc(D::MULTITHREAD, L::DEVICE_MULTITHREAD_DEFAULT, 2));


    ASSERT_THROW(L::parse("apu"), MegBrainError);
    ASSERT_THROW(L::parse("fpgbx"), MegBrainError);
    ASSERT_THROW(L::parse("cab0"), MegBrainError);
@@ -165,8 +163,6 @@ TEST(TestCompNode, Load) {
    auto atlas1 = CompNode::load("atlas1");
    ASSERT_NE(atlas0, atlas1);
 #endif


 }

 TEST(TestCompNode, FreeAfterFinalize) {
@@ -355,7 +351,6 @@ TEST(TestCompNodeAtlas, MemNode) {
 }
 #endif


 TEST(TestCompNodeCPU, PhysicalDispatch) {
    constexpr int ID = 0x2a6453e0;
    using L = CompNode::Locator;
@@ -754,7 +749,6 @@ TEST(TestCompNodeCambricon, P2PCopy) {
 #endif
 #endif // MGB_CAMBRICON


 #if MGB_ATLAS

 TEST(TestCompNodeAtlas, D2DCopy) {
@@ -780,7 +774,6 @@ TEST(TestCompNodeAtlas, D2DCopy) {
 }
 #endif


 namespace {
 class CompNodeDepedentObjectInst final : public CompNodeDepedentObject {
    int *m_dst, *m_timer;
--- a/src/core/test/mem_alloc.cpp
+++ b/src/core/test/mem_alloc.cpp
@@ -634,7 +634,6 @@ void test_gather_other(CompNode cn0, CompNode cn1) {
    opr::Sleep::sleep(cn1, 0.7);
    func->execute();
 }

 }  // namespace

 #if MGB_CUDA
@@ -668,5 +667,4 @@ TEST(TestCudaMemAlloc, FreeMem) {
 }
 #endif  // MGB_CUDA


 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/core/test/tensor.cpp
+++ b/src/core/test/tensor.cpp
@@ -340,7 +340,6 @@ TEST(TestTensor, ValueDump) {
    auto val = debug::dump_tensor(*gen({23, 45}), "test");
    debug::write_to_file(output_file("TestTensor.ValueDump.bin").c_str(), val);
 }

 template <class Src, class Dst>
 void run_negative_index_test() {
    constexpr size_t S0 = 200, S1 = 200;
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -1912,7 +1912,6 @@ TEST_PASS(FuseConvBiasNonlinPass, Basic) {
    }
 }


 #if MGB_CUDA

 TEST(TestEnableTensorCore, SmallInputShape) {
@@ -4735,7 +4734,6 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
    MGB_ASSERT_TENSOR_EQ(t1, t2);
 }


 #endif

 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/megbrain_build_config.h.in
+++ b/src/megbrain_build_config.h.in
@@ -67,7 +67,6 @@
 #define MGB_CUDA    1
 #endif


 // whether to include file/line location for assert message
 #ifndef MGB_ASSERT_LOC
 #define MGB_ASSERT_LOC          1
@@ -162,7 +161,6 @@
 #define MGB_JIT_HALIDE 0
 #endif


 #ifndef MEGDNN_WITH_CAMBRICON
 #define MEGDNN_WITH_CAMBRICON 0
 #endif
@@ -182,7 +180,6 @@
 #define MGB_ENABLE_FASTRUN 1
 #endif


 /* ================= following are more fine-grained controls ================= */

 // whether to enable json dumper
--- a/src/opr/impl/basic_arith.sereg.h
+++ b/src/opr/impl/basic_arith.sereg.h
@@ -162,7 +162,6 @@ namespace opr {
    using ReduceV2 = opr::Reduce;
    MGB_SEREG_OPR(ReduceV2, 0);
 } // namespace opr

 using TypeCvtV2 = opr::TypeCvt;
 MGB_SEREG_OPR(TypeCvtV2, 1);

--- a/src/opr/impl/blas.sereg.h
+++ b/src/opr/impl/blas.sereg.h
@@ -97,7 +97,6 @@ MGB_SEREG_OPR(SVD, 1);

 }  // namespace opr


 }  // namespace mgb

 // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/opr/impl/dnn/dnn.sereg.h
+++ b/src/opr/impl/dnn/dnn.sereg.h
@@ -613,7 +613,6 @@ MGB_SEREG_OPR(LSQ, 4);
 MGB_SEREG_OPR(LSQBackward, 5);
 }  // namespace opr


 }  // namespace mgb

 // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/opr/impl/imgproc.sereg.h
+++ b/src/opr/impl/imgproc.sereg.h
@@ -196,7 +196,6 @@ using DctChannelSelectV1 = opr::DctChannelSelect;
 MGB_SEREG_OPR(DctChannelSelectV1, 0);
 }  // namespace opr


 }  // namespace mgb

 // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/opr/impl/misc.sereg.h
+++ b/src/opr/impl/misc.sereg.h
@@ -57,7 +57,6 @@ namespace serialization {

 } // namespace serialization


 namespace opr {

    MGB_SEREG_OPR(Argmax, 1);
--- a/src/opr/impl/rand.sereg.h
+++ b/src/opr/impl/rand.sereg.h
@@ -14,7 +14,6 @@

 namespace mgb {


 namespace opr {

 using UniformRNGV1 = opr::UniformRNG;
--- a/src/opr/impl/tensor_manip.sereg.h
+++ b/src/opr/impl/tensor_manip.sereg.h
@@ -120,7 +120,6 @@ namespace serialization {
 #endif
 } // namespace serialization


 namespace opr {
    MGB_SEREG_OPR(Broadcast, 2);
    MGB_SEREG_OPR(Dimshuffle, 1);
--- a/src/opr/test/dnn/convolution.cpp
+++ b/src/opr/test/dnn/convolution.cpp
@@ -2401,7 +2401,6 @@ TEST(TestOprDNN, ConvolutionMultiCompNode) {
    worker0.join();
    worker1.join();
 }

 #endif

 }  // anonymous namespace
--- a/src/serialization/impl/serializer.cpp
+++ b/src/serialization/impl/serializer.cpp
@@ -37,7 +37,6 @@ GraphLoader::shared_tensor_name_map() {
    }
    return ret;
 }

 std::unique_ptr<GraphLoader> make_fbs_loader(std::unique_ptr<InputFile> file);
 std::unique_ptr<GraphDumper> make_fbs_dumper(std::unique_ptr<OutputFile> file);
 bool is_fbs_file(InputFile& file);
--- a/src/serialization/test/extern_c_opr.cpp
+++ b/src/serialization/test/extern_c_opr.cpp
@@ -502,5 +502,4 @@ TEST(TestExternCOpr, Dedup) {
    ASSERT_EQ(0, MGBOprDescImpl<>::nr_inst);
 }


 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -15,7 +15,6 @@ if (MGE_WITH_CUDA AND MGE_WITH_TRT)
    list(APPEND SOURCES ${SOURCES_})
 endif()


 add_executable(megbrain_test ${SOURCES})
 target_link_libraries(megbrain_test gtest gmock)
 target_link_libraries(megbrain_test megbrain megdnn ${MGE_CUDA_LIBS})
--- a/tools/param_defs/mgb_opr_param_defs.py
+++ b/tools/param_defs/mgb_opr_param_defs.py
@@ -63,7 +63,6 @@ pdef('PersistentOutputStorage').add_fields(
            'false')
 )


 (pdef('CollectiveComm', 'collective communication between multiple computing '
      'nodes on localhost')
 .add_enum(Doc('Mode', 'mode of collective communication'),