@@ -7,7 +7,6 @@ dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | |||
dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | |||
dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary | |||
tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | |||
*.caffemodel filter=lfs diff=lfs merge=lfs -text | |||
imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text | |||
ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text | |||
ci/resource/models/float/shufflenet_v2.pkl filter=lfs diff=lfs merge=lfs -text | |||
@@ -1,4 +1,8 @@ | |||
cmake_minimum_required(VERSION 3.15.2) | |||
message(STATUS "CMAKE_GENERATOR: ${CMAKE_GENERATOR}" ) | |||
if (NOT ${CMAKE_GENERATOR} STREQUAL "Ninja") | |||
message(WARNING "CMAKE_GENERATOR NOT EQUAL Ninja, which we do not recommend") | |||
endif() | |||
include (cmake/FetchMegBrainVersion.cmake) | |||
project(MegEngine LANGUAGES C CXX VERSION ${MGB_VER_STRING}) | |||
@@ -300,6 +304,19 @@ if(NOT MGE_WITH_CUDA) | |||
endif() | |||
find_package(PythonInterp 3 REQUIRED) | |||
# NOTICE: only use for targets which do not depend on the python api | |||
# PURPOSE: reuse target objects when switching the python3 version | |||
# falls back to PYTHON_EXECUTABLE if python3 cannot be found in the PATH env | |||
set(PYTHON3_IN_ENV "python3") | |||
find_program(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON3_IN_ENV}) | |||
if (PYTHON3_EXECUTABLE_WITHOUT_VERSION) | |||
message(STATUS "use ${PYTHON3_IN_ENV} as PYTHON3_EXECUTABLE_WITHOUT_VERSION") | |||
set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON3_IN_ENV}) | |||
else() | |||
message(STATUS "fallback ${PYTHON_EXECUTABLE} as PYTHON3_EXECUTABLE_WITHOUT_VERSION,\ | |||
target which depend on PYTHON3_EXECUTABLE_WITHOUT_VERSION will be rebuild when switch python3") | |||
set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON_EXECUTABLE}) | |||
endif() | |||
set(THREADS_PREFER_PTHREAD_FLAG ON) | |||
find_package(Threads) | |||
@@ -339,8 +356,8 @@ if(MGE_BUILD_IMPERATIVE_RT) | |||
set(CMAKE_CXX_STANDARD 17) | |||
endif() | |||
if(NOT MGE_WITH_CUDA) | |||
message(STATUS "Disable distributed support, as CUDA is not enabled.") | |||
if(NOT ${MGE_WITH_CUDA} AND NOT ${MGE_WITH_ROCM}) | |||
message(STATUS "Disable distributed support, as both CUDA and ROCm are disabled.") | |||
set(MGE_WITH_DISTRIBUTED OFF) | |||
endif() | |||
@@ -854,10 +871,8 @@ set(MGB_OPR_PARAM_DEFS_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/dnn/scripts/gen_param_ | |||
set(MGB_OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/opr/include/) | |||
file(MAKE_DIRECTORY ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr) | |||
add_custom_command( | |||
OUTPUT | |||
${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h | |||
COMMAND ${PYTHON_EXECUTABLE} ${MGB_OPR_PARAM_DEFS_SCRIPT} ${MGB_OPR_PARAM_DEFS_SRCS} | |||
${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h | |||
OUTPUT ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h | |||
COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${MGB_OPR_PARAM_DEFS_SCRIPT} ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h | |||
DEPENDS ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_SCRIPT} | |||
VERBATIM | |||
) | |||
@@ -890,9 +905,10 @@ if(MGE_WITH_JIT_MLIR OR MGE_BUILD_IMPERATIVE_RT) | |||
file(READ ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py CONTENTS) | |||
file(APPEND ${OPR_PARAM_DEFS_SRCS} ${CONTENTS}) | |||
file(MAKE_DIRECTORY ${MGE_GEN_IR_DIR}) | |||
add_custom_target(param_defs_tblgen | |||
COMMAND ${PYTHON_EXECUTABLE} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT} | |||
DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} | |||
add_custom_command( | |||
OUTPUT ${OPR_PARAM_DEFS_OUT} | |||
COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT} | |||
DEPENDS ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py ${OPR_PARAM_DEFS_SCRIPT} | |||
VERBATIM | |||
) | |||
# mlir tblgen sources | |||
@@ -900,9 +916,12 @@ if(MGE_WITH_JIT_MLIR OR MGE_BUILD_IMPERATIVE_RT) | |||
set(MGE_IR_INCLUDE_DIRS ${MLIR_LLVM_INCLUDE_DIR} ${MGE_IR_DIR} ${MGE_GEN_IR_DIR}) | |||
list(TRANSFORM MGE_IR_INCLUDE_DIRS PREPEND "-I") | |||
file(GLOB_RECURSE MGE_IR_TDS ${MGE_IR_DIR}/*.td) | |||
add_custom_target(param_defs_tblgen DEPENDS ${OPR_PARAM_DEFS_OUT}) | |||
endif() | |||
if(MGE_WITH_DISTRIBUTED) | |||
set(MEGRAY_WITH_NCCL ${MGE_WITH_CUDA} CACHE BOOL "Override MegRay option" FORCE) | |||
set(MEGRAY_WITH_RCCL ${MGE_WITH_ROCM} CACHE BOOL "Override MegRay option" FORCE) | |||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/MegRay) | |||
endif() | |||
@@ -13,89 +13,79 @@ else() | |||
message(FATAL_ERROR "Could not find HIP. Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location.") | |||
endif() | |||
string(REPLACE "." ";" HIP_VERSION_LIST ${HIP_VERSION}) | |||
list(GET HIP_VERSION_LIST 0 HIP_VERSION_MAJOR) | |||
list(GET HIP_VERSION_LIST 1 HIP_VERSION_MINOR) | |||
if (NOT ${HIP_VERSION_MAJOR} STREQUAL "3") | |||
message(FATAL_ERROR "ROCM version needed 3.x, Please update ROCM.") | |||
else() | |||
if (${HIP_VERSION_MINOR} LESS "7") | |||
message(WARNING "ROCM version 3.x which x(got ${HIP_VERSION_MINOR}) greater equal 7 is prefered.") | |||
if (${HIP_VERSION} VERSION_LESS 3.0) | |||
message(FATAL_ERROR "ROCM version needed 3. Please update ROCM.") | |||
endif() | |||
macro(hipconfig_get_option variable option) | |||
if(NOT DEFINED ${variable}) | |||
execute_process( | |||
COMMAND ${HIP_HIPCONFIG_EXECUTABLE} ${option} | |||
OUTPUT_VARIABLE ${variable}) | |||
endif() | |||
endmacro() | |||
hipconfig_get_option(HIP_COMPILER "--compiler") | |||
hipconfig_get_option(HIP_CPP_CONFIG "--cpp_config") | |||
separate_arguments(HIP_CPP_CONFIG) | |||
foreach(hip_config_item ${HIP_CPP_CONFIG}) | |||
foreach(macro_name "__HIP_PLATFORM_HCC__" "__HIP_ROCclr__") | |||
if(${hip_config_item} STREQUAL "-D${macro_name}=") | |||
set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name}\n") | |||
set(HIP_CPP_UNDEFINE "${HIP_CPP_UNDEFINE}\ | |||
#ifdef ${macro_name}\n#undef ${macro_name}\n\ | |||
#else\n#error\n\ | |||
#endif\n") | |||
elseif(${hip_config_item} STREQUAL "-D${macro_name}") | |||
set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name} 1\n") | |||
set(HIP_CPP_UNDEFINE "${HIP_CPP_UNDEFINE}\ | |||
#ifdef ${macro_name}\n#undef ${macro_name}\n\ | |||
#else\n#error\n\ | |||
#endif\n") | |||
endif() | |||
endforeach() | |||
endforeach() | |||
message(STATUS "Using HIP compiler ${HIP_COMPILER}") | |||
if(${HIP_COMPILER} STREQUAL "hcc") | |||
set(MGE_ROCM_LIBS hip_hcc) | |||
message(WARNING "hcc is not well supported, please modify link.txt to link with hipcc") | |||
elseif (${HIP_COMPILER} STREQUAL "clang") | |||
set(MGE_ROCM_LIBS amdhip64) | |||
endif() | |||
set(MGE_ROCM_LIBS OpenCL amdhip64 MIOpen rocblas rocrand) | |||
list(APPEND MGE_ROCM_LIBS amdocl64 MIOpen rocblas rocrand) | |||
set(HIP_INCLUDE_DIR ${HIP_ROOT_DIR}/../include) | |||
set(HIP_LIBRARY_DIR ${HIP_ROOT_DIR}/../lib) | |||
#miopen | |||
get_filename_component(__found_miopen_library ${HIP_ROOT_DIR}/../miopen/lib REALPATH) | |||
find_path(MIOPEN_LIBRARY_DIR | |||
NAMES libMIOpen.so | |||
HINTS ${PC_MIOPEN_INCLUDE_DIRS} ${MIOPEN_ROOT_DIR} ${ROCM_TOOLKIT_INCLUDE} ${__found_miopen_library} | |||
PATH_SUFFIXES lib | |||
DOC "Path to MIOPEN library directory." ) | |||
if(MIOPEN_LIBRARY_DIR STREQUAL "MIOPEN_LIBRARY_DIR-NOTFOUND") | |||
message(FATAL_ERROR "Can not find MIOPEN Library") | |||
endif() | |||
get_filename_component(__found_miopen_include ${HIP_ROOT_DIR}/../miopen/include REALPATH) | |||
find_path(MIOPEN_INCLUDE_DIR | |||
NAMES miopen | |||
HINTS ${PC_MIOPEN_INCLUDE_DIRS} ${MIOPEN_ROOT_DIR} ${ROCM_TOOLKIT_INCLUDE} ${__found_miopen_include} | |||
PATH_SUFFIXES include | |||
DOC "Path to MIOPEN include directory." ) | |||
if(MIOPEN_INCLUDE_DIR STREQUAL "MIOPEN_INCLUDE_DIR-NOTFOUND") | |||
message(FATAL_ERROR "Can not find MIOEPN INCLUDE") | |||
endif() | |||
#rocblas | |||
get_filename_component(__found_rocblas_library ${HIP_ROOT_DIR}/../rocblas/lib REALPATH) | |||
find_path(ROCBLAS_LIBRARY_DIR | |||
NAMES librocblas.so | |||
HINTS ${PC_ROCBLAS_INCLUDE_DIRS} ${ROCBLAS_ROOT_DIR} ${ROCM_TOOLKIT_INCLUDE} ${__found_rocblas_library} | |||
PATH_SUFFIXES lib | |||
DOC "Path to ROCBLAS library directory." ) | |||
if(ROCBLAS_LIBRARY_DIR STREQUAL "ROCBLAS_LIBRARY_DIR-NOTFOUND") | |||
message(FATAL_ERROR "Can not find ROCBLAS Library") | |||
endif() | |||
get_filename_component(__found_rocblas_include ${HIP_ROOT_DIR}/../rocblas/include REALPATH) | |||
find_path(ROCBLAS_INCLUDE_DIR | |||
NAMES rocblas.h | |||
HINTS ${PC_ROCBLAS_INCLUDE_DIRS} ${ROCBLAS_ROOT_DIR} ${ROCM_TOOLKIT_INCLUDE} ${__found_rocblas_include} | |||
PATH_SUFFIXES include | |||
DOC "Path to ROCBLAS include directory." ) | |||
if(ROCBLAS_INCLUDE_DIR STREQUAL "ROCBLAS_INCLUDE_DIR-NOTFOUND") | |||
message(FATAL_ERROR "Can not find ROCBLAS INCLUDE") | |||
endif() | |||
#rocrand | |||
get_filename_component(__found_rocrand_library ${HIP_ROOT_DIR}/../rocrand/lib REALPATH) | |||
find_path(ROCRAND_LIBRARY_DIR | |||
NAMES librocrand.so | |||
HINTS ${PC_ROCRAND_INCLUDE_DIRS} ${ROCRAND_ROOT_DIR} ${ROCM_TOOLKIT_INCLUDE} ${__found_rocrand_library} | |||
PATH_SUFFIXES lib | |||
DOC "Path to ROCRAND library directory." ) | |||
function(find_rocm_library name dirname include library) | |||
find_path(${name}_LIBRARY_DIR | |||
NAMES ${library} | |||
HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" | |||
PATH_SUFFIXES lib lib/x86_64 | |||
DOC "Path to ${name} library directory") | |||
if(ROCRAND_LIBRARY_DIR STREQUAL "ROCRAND_LIBRARY_DIR-NOTFOUND") | |||
message(FATAL_ERROR "Can not find ROCRAND Library") | |||
endif() | |||
if(${${name}_LIBRARY_DIR} MATCHES "NOTFOUND$") | |||
message(FATAL_ERROR "Can not find ${name} library") | |||
endif() | |||
get_filename_component(__found_rocrand_include ${HIP_ROOT_DIR}/../rocrand/include REALPATH) | |||
find_path(ROCRAND_INCLUDE_DIR | |||
NAMES rocrand.h | |||
HINTS ${PC_ROCRAND_INCLUDE_DIRS} ${ROCRAND_ROOT_DIR} ${ROCM_TOOLKIT_INCLUDE} ${__found_rocrand_include} | |||
PATH_SUFFIXES include | |||
DOC "Path to ROCRAND include directory." ) | |||
if(ROCRAND_INCLUDE_DIR STREQUAL "ROCRAND_INCLUDE_DIR-NOTFOUND") | |||
message(FATAL_ERROR "Can not find ROCRAND INCLUDE") | |||
endif() | |||
find_path(${name}_INCLUDE_DIR | |||
NAMES ${include} | |||
HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" | |||
PATH_SUFFIXES include | |||
DOC "Path to ${name} include directory") | |||
if(${name}_INCLUDE_DIR MATCHES "NOTFOUND$") | |||
message(FATAL_ERROR "Can not find ${name} include") | |||
endif() | |||
message(DEBUG "Found lib ${${name}_LIBRARY_DIR}, include ${${name}_INCLUDE_DIR}") | |||
endfunction() | |||
find_rocm_library(MIOPEN miopen miopen libMIOpen.so) | |||
find_rocm_library(ROCBLAS rocblas rocblas.h librocblas.so) | |||
find_rocm_library(ROCRAND rocrand rocrand.h librocrand.so) | |||
find_rocm_library(AMDOCL opencl CL libamdocl64.so) |
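For reference, a hedged sketch of what the hipconfig-driven macro handling above would generate. Assuming hipconfig reports -D__HIP_PLATFORM_HCC__ (the exact flags depend on the local ROCm installation), the HIP_CPP_DEFINE and HIP_CPP_UNDEFINE strings substituted into the prologue/epilogue templates shown later in this diff would expand roughly to:

// hypothetical hcc_defs_prologue.h after configure_file expands @HIP_CPP_DEFINE@
#define __HIP_PLATFORM_HCC__ 1

// hypothetical hcc_defs_epilogue.h after configure_file expands @HIP_CPP_UNDEFINE@
#ifdef __HIP_PLATFORM_HCC__
#undef __HIP_PLATFORM_HCC__
#else
#error
#endif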
@@ -7,9 +7,9 @@ add_custom_command( | |||
OUTPUT | |||
${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h | |||
${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h | |||
COMMAND ${PYTHON_EXECUTABLE} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} | |||
COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} | |||
${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h | |||
COMMAND ${PYTHON_EXECUTABLE} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} | |||
COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} | |||
tmp_unuse.log --write-cppjson ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h | |||
DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} | |||
VERBATIM | |||
@@ -26,7 +26,7 @@ file(MAKE_DIRECTORY ${OPR_PARAM_DEFS_OUT_DIR}/src/common) | |||
add_custom_command( | |||
OUTPUT | |||
${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh | |||
COMMAND ${PYTHON_EXECUTABLE} ${OPR_PARAM_DEFS_SCRIPT} | |||
COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} | |||
--enumv ${OPR_PARAM_DEFS_SRCS} | |||
${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh | |||
DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} | |||
@@ -9,10 +9,6 @@ | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#ifdef __HIP_PLATFORM_HCC__ | |||
#undef __HIP_PLATFORM_HCC__ | |||
#else | |||
#error "hcc_defs_epilogue.h must be included after hcc_defs_prologue.h" | |||
#endif | |||
@HIP_CPP_UNDEFINE@ | |||
// vim: syntax=cpp.doxygen |
@@ -9,6 +9,6 @@ | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#define __HIP_PLATFORM_HCC__ | |||
@HIP_CPP_DEFINE@ | |||
// vim: syntax=cpp.doxygen |
@@ -18,4 +18,39 @@ | |||
#include "megdnn/oprs/utils.h" | |||
#include "megdnn/oprs/linalg.h" | |||
template <typename Opr> | |||
struct OprArityTrait; | |||
template <typename Opr, int _arity_in, int _arity_out> | |||
struct OprArityTraitTmpl { | |||
static constexpr int arity_in = _arity_in; | |||
static constexpr int arity_out = _arity_out; | |||
static constexpr int arity = arity_in + arity_out; | |||
}; | |||
#define INST_ARITY(_Opr, _in, _out) \ | |||
template <> \ | |||
struct OprArityTrait<_Opr> : public OprArityTraitTmpl<_Opr, _in, _out> {}; | |||
INST_ARITY(megdnn::ConvolutionBackwardData, 2, 1); | |||
INST_ARITY(megdnn::ConvolutionBackwardFilter, 2, 1); | |||
INST_ARITY(megdnn::Convolution3DForward, 2, 1); | |||
INST_ARITY(megdnn::Convolution3DBackwardData, 2, 1); | |||
INST_ARITY(megdnn::Convolution3DBackwardFilter, 2, 1); | |||
INST_ARITY(megdnn::LocalShareForward, 2, 1); | |||
INST_ARITY(megdnn::LocalShareBackwardData, 2, 1); | |||
INST_ARITY(megdnn::LocalShareBackwardFilter, 2, 1); | |||
INST_ARITY(megdnn::Convolution, 2, 1); | |||
INST_ARITY(megdnn::DeformableConvForward, 4, 1); | |||
INST_ARITY(megdnn::DeformableConvBackwardFilter, 4, 1); | |||
INST_ARITY(megdnn::BatchConvBiasForward, 4, 1); | |||
INST_ARITY(megdnn::ConvBias, 4, 1); | |||
INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3); | |||
INST_ARITY(megdnn::MatrixMul, 2, 1); | |||
INST_ARITY(megdnn::BatchedMatrixMul, 2, 1); | |||
#undef INST_ARITY | |||
// vim: syntax=cpp.doxygen |
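A hedged illustration of how these arity traits might be queried at compile time; the static_asserts below are an assumed usage written for this note, not code from the diff (they presume the header above is included):

// hypothetical compile-time lookups against the OprArityTrait table above
static_assert(OprArityTrait<megdnn::ConvBias>::arity_in == 4, "ConvBias takes 4 inputs");
static_assert(OprArityTrait<megdnn::MatrixMul>::arity_out == 1, "MatrixMul has 1 output");
static_assert(OprArityTrait<megdnn::DeformableConvBackwardData>::arity == 5 + 3, "inputs + outputs");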
@@ -122,6 +122,20 @@ public: | |||
* these algorithms to speed up fastrun. | |||
* */ | |||
NAIVE = 1 << 1, | |||
/** | |||
* \brief whether the usability of the algo depends on the input shape. | |||
* */ | |||
USABLE_DEPEND_ON_SHAPE = 1 << 2, | |||
/** | |||
* \brief whether the accuracy of the algo depends on the batch size. | |||
* For an algorithm with this attribute, even if the content of each | |||
* batch is the same, the output for a multi-batch input may differ | |||
* from the output for a single-batch input. | |||
* */ | |||
ACCURACY_DEPEND_ON_BATCH = 1 << 3, | |||
}; | |||
/** | |||
@@ -192,6 +192,87 @@ class ReduceForward: public OperatorBase { | |||
}; | |||
using Reduce = ReduceForward; | |||
class CorrelationBase : public OperatorBase { | |||
DEF_OPR_IMPL_CTOR(CorrelationBase, OperatorBase); | |||
DEF_OPR_PARAM(Correlation); | |||
protected: | |||
void deduce_layout_fwd(const TensorLayout& data1, const TensorLayout& data2, | |||
TensorLayout& dst); | |||
void check_layout_fwd(const TensorLayout& data1, const TensorLayout& data2, | |||
const TensorLayout& dst); | |||
}; | |||
class CorrelationForward : public CorrelationBase { | |||
DEF_OPR_IMPL(CorrelationForward, CorrelationBase, 2, 1); | |||
public: | |||
/** | |||
* \param[in] data1 (n, c, ih, iw) | |||
* \param[in] data2 (n, c, ih, iw) | |||
* \param[out] dst (n, q, oh, ow), where q is the number of neighborhood positions | |||
* */ | |||
virtual void exec(_megdnn_tensor_in data1, _megdnn_tensor_in data2, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; | |||
void deduce_layout(const TensorLayout& data1, const TensorLayout& data2, | |||
TensorLayout& dst); | |||
virtual size_t get_workspace_in_bytes(const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
const TensorLayout& dst) = 0; | |||
protected: | |||
void check_exec(const TensorLayout& data1, const TensorLayout& data2, | |||
const TensorLayout& dst, size_t workspace_in_bytes); | |||
}; | |||
using Correlation = CorrelationForward; | |||
class CorrelationBackwardData1 : public CorrelationBase { | |||
DEF_OPR_IMPL(CorrelationBackwardData1, CorrelationBase, 3, 1); | |||
public: | |||
/** | |||
* \param[in] diff the backpropagated gradient wrt. dst | |||
* \param[in] data1 the `data1' parameter in CorrelationForward::exec | |||
* \param[in] data2 the `data2' parameter in CorrelationForward::exec | |||
* \param[out] grad1 the backpropagated gradient wrt. data1 | |||
*/ | |||
virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_in data1, _megdnn_tensor_in data2, | |||
_megdnn_tensor_out grad1, _megdnn_workspace workspace) = 0; | |||
void deduce_layout(const TensorLayout& diff1, const TensorLayout& data1, | |||
const TensorLayout& data2, TensorLayout& dst); | |||
virtual size_t get_workspace_in_bytes(const TensorLayout& diff, | |||
const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
const TensorLayout& grad1) = 0; | |||
protected: | |||
void check_exec(const TensorLayout& diff, const TensorLayout& data1, const TensorLayout& data2, | |||
const TensorLayout& grad1, size_t workspace_in_bytes); | |||
}; | |||
class CorrelationBackwardData2 : public CorrelationBase { | |||
DEF_OPR_IMPL(CorrelationBackwardData2, CorrelationBase, 3, 1); | |||
public: | |||
/** | |||
* \param[in] diff the backpropagated gradient wrt. dst | |||
* \param[in] data1 the `data1' parameter in CorrelationForward::exec | |||
* \param[in] data2 the `data2' parameter in CorrelationForward::exec | |||
* \param[out] grad2 the backpropagated gradient wrt. data2 | |||
*/ | |||
virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_in data1, _megdnn_tensor_in data2, | |||
_megdnn_tensor_out grad2, _megdnn_workspace workspace) = 0; | |||
void deduce_layout(const TensorLayout& diff1, const TensorLayout& data1, | |||
const TensorLayout& data2, TensorLayout& dst); | |||
virtual size_t get_workspace_in_bytes(const TensorLayout& diff, | |||
const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
const TensorLayout& grad2) = 0; | |||
protected: | |||
void check_exec(const TensorLayout& diff, const TensorLayout& data1, const TensorLayout& data2, | |||
const TensorLayout& grad2, size_t workspace_in_bytes); | |||
}; | |||
class CumsumForward: public OperatorBase { | |||
DEF_OPR_PARAM(Cumsum); | |||
DEF_OPR_IMPL(CumsumForward, OperatorBase, 1, 1); | |||
@@ -220,7 +220,7 @@ pdef('Axis').add_fields('int32', 'axis', 0) | |||
(pdef('Images2Neibs'). | |||
add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 1, 'stride_w', 1, | |||
'window_h', 3, 'window_w', 3)) | |||
'dilate_h', 1, 'dilate_w', 1, 'window_h', 3, 'window_w', 3)) | |||
(pdef('Pooling', version=0, is_legacy=True). | |||
add_enum( | |||
@@ -1053,6 +1053,16 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o | |||
'sample_width', '2') | |||
) | |||
(pdef('Correlation'). | |||
add_enum_alias('Format', 'ConvolutionV0'). | |||
add_fields('uint32', 'kernel_size', '1'). | |||
add_fields('uint32', 'max_displacement', '1'). | |||
add_fields('uint32', 'stride1', '1'). | |||
add_fields('uint32', 'stride2', '1'). | |||
add_fields('uint32', 'pad_size', '0'). | |||
add_fields('bool', 'is_multiply', 'true') | |||
) | |||
(pdef('DeformablePSROIPooling'). | |||
add_fields('bool', 'no_trans', 'true'). | |||
add_fields('float32', 'spatial_scale', 1, | |||
@@ -63,7 +63,7 @@ macro (HIP_COMPILE _hip_target _hip_objs) | |||
add_custom_target(${_hip_target}) | |||
# set return value | |||
set (${_hip_objs} ${_generated_files}) | |||
set(${_hip_objs} ${_generated_files}) | |||
endmacro() | |||
if (MGE_WITH_ROCM) | |||
@@ -74,14 +74,21 @@ if (MGE_WITH_ROCM) | |||
# empty file to bypass this error. | |||
file(GLOB start.cpp.hip "" ) | |||
list(APPEND HIP_SOURCES start.cpp.hip) | |||
configure_file( | |||
${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h.in | |||
${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h) | |||
file (GLOB_RECURSE HIPSOURCES rocm/*.cpp.hip) | |||
set(HIP_TARGET_NAME hip_kernel) | |||
configure_file( | |||
${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h.in | |||
${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h) | |||
file(GLOB_RECURSE HIP_SOURCES_ rocm/*.cpp.hip) | |||
set(HIP_TARGET_NAME megdnn_hip_kernel) | |||
set(_HIPCC_OPTIONS "-fPIC") | |||
set(_HCC_OPTIONS "-fPIC") | |||
set(_NVCC_OPTIONS "-fPIC") | |||
list(APPEND HIP_SOURCES ${HIPSOURCES}) | |||
list(APPEND HIP_SOURCES ${HIP_SOURCES_}) | |||
set_source_files_properties(${HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) | |||
HIP_INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/dnn | |||
${PROJECT_SOURCE_DIR}/dnn/include | |||
@@ -91,13 +98,14 @@ if (MGE_WITH_ROCM) | |||
${HIP_INCLUDE_DIR} | |||
${MIOPEN_INCLUDE_DIR} | |||
${ROCBLAS_INCLUDE_DIR} | |||
${ROCRAND_INCLUDE_DIR}) | |||
${ROCRAND_INCLUDE_DIR} | |||
${AMDOCL_INCLUDE_DIR}) | |||
hip_compile( | |||
${HIP_TARGET_NAME} HIPOBJS ${HIP_SOURCES} | |||
HIPCC_OPTIONS ${_HIPCC_OPTIONS} | |||
HCC_OPTIONS ${_HCC_OPTIONS} | |||
NVCC_OPTIONS ${_NVCC_OPTIONS}) | |||
list (APPEND SOURCES ${HIPOBJS}) | |||
${HIP_TARGET_NAME} HIPOBJS ${HIP_SOURCES} | |||
HIPCC_OPTIONS ${_HIPCC_OPTIONS} | |||
HCC_OPTIONS ${_HCC_OPTIONS} | |||
NVCC_OPTIONS ${_NVCC_OPTIONS}) | |||
list(APPEND SOURCES ${HIPOBJS}) | |||
endif () | |||
if(MGE_WITH_CUDA) | |||
@@ -139,16 +147,18 @@ if(MGE_WITH_CUDA) | |||
endif() | |||
if(MGE_WITH_ROCM) | |||
target_include_directories(megdnn PUBLIC | |||
target_include_directories(megdnn PUBLIC | |||
${HIP_INCLUDE_DIR} | |||
${MIOPEN_INCLUDE_DIR} | |||
${ROCBLAS_INCLUDE_DIR} | |||
${ROCRAND_INCLUDE_DIR}) | |||
target_link_directories(megdnn PUBLIC | |||
${ROCRAND_INCLUDE_DIR} | |||
${AMDOCL_INCLUDE_DIR}) | |||
target_link_directories(megdnn PUBLIC | |||
${HIP_LIBRARY_DIR} | |||
${MIOPEN_LIBRARY_DIR} | |||
${ROCBLAS_LIBRARY_DIR} | |||
${ROCRAND_LIBRARY_DIR}) | |||
${ROCRAND_LIBRARY_DIR} | |||
${AMDOCL_LIBRARY_DIR}) | |||
endif() | |||
@@ -35,7 +35,8 @@ public: | |||
class MatrixMulImpl::AlgoF32MK4_8x12x1 final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "AARCH64_F32_MK4_K8X12X1"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -146,7 +147,8 @@ public: | |||
class MatrixMulImpl::AlgoInt8x8x32MK4_4x4x16 final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "AARCH64_INT8X8X32_MK4_4X4X16"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -220,7 +222,8 @@ public: | |||
class MatrixMulImpl::AlgoInt4x4x16K8x8x8 final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "AARCH64_INT4X4X16_K8X8X8"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -235,7 +238,8 @@ public: | |||
class MatrixMulImpl::AlgoInt8x8x16MK4_16x12x4 final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { | |||
return "AARCH64_INT8X8X16_MK4_16X12X4"; | |||
@@ -253,7 +257,8 @@ public: | |||
class MatrixMulImpl::AlgoInt8x8x16MK4_K8x8x8 final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { | |||
return "AARCH64_INT8X8X16_MK4_K8X8X8"; | |||
@@ -271,7 +276,8 @@ public: | |||
class MatrixMulImpl::AlgoInt8x8x16MK4_4x4x8 final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "AARCH64_INT8X8X16_MK4_4X4X8"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -330,7 +336,8 @@ public: | |||
class MatrixMulImpl::AlgoQuint8GemvDotProd final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "AARCH64_QUINT8_GEMV_DOTPROD"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -34,7 +34,8 @@ public: | |||
class MatrixMulImpl::AlgoInt8x8x32Gemv : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "ARM_COMMON_INT8X8X32_GEMV"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -50,7 +51,8 @@ public: | |||
class MatrixMulImpl::AlgoInt8x8x32GemvMK4 : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "ARM_COMMON_INT8X8X32_GEMV_MK4"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -67,7 +69,8 @@ public: | |||
class MatrixMulImpl::AlgoInt8x8x32GemvMK4Dot : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "ARM_COMMON_INT8X8X32_GEMV_MK4_DOT"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -102,7 +105,8 @@ public: | |||
class MatrixMulImpl::AlgoF32GemvMK4 : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "ARM_COMMON_F32_GEMV_MK4"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -35,7 +35,8 @@ public: | |||
class MatrixMulImpl::AlgoF32MK4Pack4x12 final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "ARMV7_F32_MK4_PACK_4X12"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -224,7 +225,8 @@ public: | |||
class MatrixMulImpl::AlgoInt8x8x16MK4_8x8x4 final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "ARMV7_INT8X8X16_MK4_K8X8X4"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -266,7 +268,8 @@ public: | |||
class MatrixMulImpl::AlgoInt8x8x32MK4_4x2x16 final : public AlgoBase { | |||
public: | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { return "ARMV7_INT8X8X32_MK4_4X2X16"; } | |||
bool usable(const KernSizeParam&) const override; | |||
@@ -18,7 +18,9 @@ using namespace megdnn; | |||
#define FOREACH_ALGO_ATTRIBUTE(cb) \ | |||
cb(DEFAULT) \ | |||
cb(REPRODUCIBLE) \ | |||
cb(NAIVE) | |||
cb(NAIVE) \ | |||
cb(USABLE_DEPEND_ON_SHAPE) \ | |||
cb(ACCURACY_DEPEND_ON_BATCH) | |||
namespace { | |||
inline const char* attr_str(const AlgoAttribute& attr) { | |||
@@ -47,6 +47,9 @@ namespace megdnn { | |||
return algo_pack().all_algos_map().at(desc); \ | |||
} | |||
#define MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb) \ | |||
cb(AlgoAttribute::ACCURACY_DEPEND_ON_BATCH) | |||
/** | |||
* \brief construct algo from AlgorithmDesc | |||
*/ | |||
@@ -0,0 +1,430 @@ | |||
/** | |||
* \file dnn/src/common/api_cache.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include <atomic> | |||
#include <cstring> | |||
#include <memory> | |||
#include <mutex> | |||
#include <tuple> | |||
#include <unordered_map> | |||
#include "megdnn/thin/function.h" | |||
#include "./utils.h" | |||
namespace megdnn { | |||
// https://jfdube.wordpress.com/2014/01/03/implementing-a-recursive-read-write-spinlock/ | |||
class RWSpin { | |||
public: | |||
class Lock { | |||
private: | |||
RWSpin* m_spin; | |||
void (RWSpin::*m_lock)(void); | |||
void (RWSpin::*m_unlock)(void); | |||
public: | |||
Lock(RWSpin* spin, decltype(m_lock) lock, decltype(m_unlock) unlock) | |||
: m_spin{spin}, m_lock{lock}, m_unlock{unlock} {} | |||
void lock() { (m_spin->*m_lock)(); } | |||
void unlock() { (m_spin->*m_unlock)(); } | |||
}; | |||
private: | |||
std::atomic<uint32_t> m_atomic{0}; | |||
static constexpr uint32_t sm_reader_mask = 0x7FFFFFFF; | |||
static constexpr uint32_t sm_writer_mask = 0x80000000; | |||
void _reader_lock() { | |||
uint32_t expected = m_atomic; | |||
do { | |||
expected &= sm_reader_mask; | |||
} while (!m_atomic.compare_exchange_strong(expected, expected + 1)); | |||
} | |||
void _reader_unlock() { m_atomic--; } | |||
void _writer_lock() { | |||
uint32_t expected = m_atomic; | |||
do { | |||
expected &= sm_reader_mask; | |||
} while (!m_atomic.compare_exchange_strong(expected, | |||
expected | sm_writer_mask)); | |||
while (m_atomic.load() != sm_writer_mask) | |||
; | |||
} | |||
void _writer_unlock() { | |||
// assert m_atomic == sm_writer_mask | |||
m_atomic = 0; | |||
} | |||
public: | |||
Lock reader() { | |||
return {this, &RWSpin::_reader_lock, &RWSpin::_reader_unlock}; | |||
} | |||
Lock writer() { | |||
return {this, &RWSpin::_writer_lock, &RWSpin::_writer_unlock}; | |||
} | |||
}; | |||
template <typename TSignature> | |||
class FunctionCache; | |||
template <typename TRet, typename... TArgs> | |||
class FunctionCache<TRet(TArgs...)> { | |||
public: | |||
using key_t = std::string; | |||
using value_t = TRet; | |||
using key_mapper_t = thin_function<key_t(TArgs...)>; | |||
using value_mapper_t = thin_function<value_t(TArgs...)>; | |||
using storage_t = std::unordered_map<key_t, value_t>; | |||
storage_t storage; | |||
key_mapper_t key_mapper; | |||
value_mapper_t value_mapper; | |||
RWSpin spin; | |||
public: | |||
TRet operator()(TArgs... args) { | |||
key_t key = key_mapper(args...); | |||
auto reader_lock = spin.reader(); | |||
auto writer_lock = spin.writer(); | |||
{ | |||
MEGDNN_LOCK_GUARD(reader_lock); | |||
auto iter = storage.find(key); | |||
if (iter != storage.end()) { | |||
return iter->second; | |||
} | |||
} | |||
// RWSpin doesn't support upgrade | |||
{ | |||
MEGDNN_LOCK_GUARD(writer_lock); | |||
if (storage.count(key) != 0) { | |||
return storage[key]; | |||
} | |||
value_t ret = value_mapper(std::forward<TArgs>(args)...); | |||
storage[key] = ret; | |||
return ret; | |||
} | |||
} | |||
}; | |||
// FIFO-style serializer: values are read back in the order they were written | |||
class StringSerializer { | |||
private: | |||
std::string m_buffer; | |||
size_t m_cursor = 0; | |||
public: | |||
template <typename T> | |||
T read_plain() { | |||
static_assert(std::is_trivially_copyable<T>::value, "invalid type"); | |||
T ret; | |||
std::memcpy(&ret, m_buffer.data() + m_cursor, sizeof(T)); | |||
m_cursor += sizeof(T); | |||
return ret; | |||
} | |||
template <typename T> | |||
void read_plain(T* dest) { | |||
static_assert(std::is_trivially_copyable<T>::value, "invalid type"); | |||
std::memcpy(dest, m_buffer.data() + m_cursor, sizeof(T)); | |||
m_cursor += sizeof(T); | |||
} | |||
template <typename T> | |||
void write_plain(const T& value) { | |||
static_assert(std::is_trivially_copyable<T>::value, | |||
"type should be trivially copyable"); | |||
m_buffer.append(reinterpret_cast<const char*>(&value), sizeof(T)); | |||
} | |||
std::string take() { return std::move(m_buffer); } | |||
void reset(std::string new_buf) { | |||
m_cursor = 0; | |||
m_buffer = std::move(new_buf); | |||
} | |||
}; | |||
struct Empty {}; | |||
// in: seq[1, 2, ..., m] | |||
// out: seq[N+1, N+2, ... N+m] | |||
template <std::size_t N, std::size_t... Seq> | |||
inline std::index_sequence<N + Seq...> inc_index_sequence( | |||
std::index_sequence<Seq...>) { | |||
return {}; | |||
} | |||
template <typename... TParams> | |||
class ParamBundle { | |||
private: | |||
// out: Min, Min+1, ..., Max-1 | |||
template <std::size_t Min, std::size_t Max> | |||
using make_index_range = decltype( | |||
inc_index_sequence<Min>(std::make_index_sequence<Max - Min>())); | |||
// store params in a tuple | |||
using storage_t = std::tuple<typename std::remove_reference_t<TParams>...>; | |||
storage_t m_storage; | |||
// deconstruct tuple and call functor | |||
template <typename TFunctor, size_t... Indices> | |||
auto call_helper(TFunctor&& functor, std::index_sequence<Indices...>) { | |||
return functor(std::get<Indices>(m_storage).value...); | |||
} | |||
template <size_t Index, size_t... Indices, typename TPrev> | |||
auto serialize_helper(StringSerializer& ser, TPrev&& prev, | |||
std::index_sequence<Index, Indices...>) { | |||
return serialize_helper(ser, | |||
std::get<Index>(m_storage).serialize(ser, prev), | |||
std::index_sequence<Indices...>()); | |||
} | |||
template <typename TPrev> | |||
auto serialize_helper(StringSerializer& ser, TPrev&& prev, | |||
std::index_sequence<>) {} | |||
template <size_t Index, size_t... Indices, typename TPrev> | |||
auto deserialize_helper(StringSerializer& ser, TPrev&& prev, | |||
std::index_sequence<Index, Indices...>) { | |||
return deserialize_helper( | |||
ser, std::get<Index>(m_storage).deserialize(ser, prev), | |||
std::index_sequence<Indices...>()); | |||
} | |||
template <typename TPrev> | |||
auto deserialize_helper(StringSerializer& ser, TPrev&& prev, | |||
std::index_sequence<>) {} | |||
template <size_t Index, size_t... Indices, typename TArg, typename... TArgs> | |||
void set_values_helper(std::index_sequence<Index, Indices...>, TArg&& arg, | |||
TArgs&&... args) { | |||
std::get<Index>(m_storage).value = std::forward<TArg>(arg); | |||
set_values_helper(std::index_sequence<Indices...>(), | |||
std::forward<TArgs>(args)...); | |||
} | |||
template <size_t... Indices> | |||
void set_values_helper(std::index_sequence<Indices...>) { | |||
static_assert(sizeof...(Indices) == 0, "redundant indices"); | |||
} | |||
public: | |||
template <typename TFunctor> | |||
auto call_by(TFunctor&& functor) { | |||
return call_helper(std::forward<TFunctor>(functor), | |||
std::make_index_sequence<sizeof...(TParams)>()); | |||
} | |||
// recursively store params into ser | |||
template <size_t NBegin, size_t NEnd> | |||
void serialize_params(StringSerializer& ser) { | |||
static_assert(NEnd >= NBegin, "invalid range"); | |||
serialize_helper(ser, Empty{}, make_index_range<NBegin, NEnd>()); | |||
} | |||
// recursively load params from ser | |||
template <size_t NBegin, size_t NEnd> | |||
void deserialize_params(StringSerializer& ser) { | |||
static_assert(NEnd >= NBegin, "invalid range"); | |||
deserialize_helper(ser, Empty{}, make_index_range<NBegin, NEnd>()); | |||
} | |||
// recursively set params into m_storage | |||
template <size_t NBegin, size_t NEnd, typename... TArgs> | |||
void set_values(TArgs&&... args) { | |||
set_values_helper(make_index_range<NBegin, NEnd>(), | |||
std::forward<TArgs>(args)...); | |||
} | |||
}; | |||
template <typename T> | |||
class Param { | |||
public: | |||
T value; | |||
Empty serialize(StringSerializer& ser, Empty) { | |||
ser.write_plain(value); | |||
return Empty{}; | |||
} | |||
Empty deserialize(StringSerializer& ser, Empty) { | |||
ser.read_plain(&value); | |||
return Empty{}; | |||
} | |||
}; | |||
template <typename TRet = Param<Empty>, typename TInputs = std::tuple<>, | |||
typename TOutputs = std::tuple<>> | |||
class FunctionCacheBuilder { | |||
private: | |||
// decl value with type of tuple-of-args | |||
static auto declargs() | |||
-> decltype(std::tuple_cat(std::declval<TInputs>(), | |||
std::declval<TOutputs>())) { | |||
return {}; | |||
} | |||
template <size_t... Indices> | |||
static auto declfunction_helper(std::index_sequence<Indices...>) | |||
-> thin_function<decltype(std::declval<TRet>().value)( | |||
decltype(std::get<Indices>(declargs()).value)...)> { | |||
return {}; | |||
} | |||
// decl value with type of original function | |||
static auto declfunction() { | |||
return declfunction_helper( | |||
std::make_index_sequence<std::tuple_size<TInputs>::value + | |||
std::tuple_size<TOutputs>::value>()); | |||
} | |||
template <size_t... Indices> | |||
static auto declbundle_helper(std::index_sequence<Indices...>) | |||
-> ParamBundle<std::remove_reference_t< | |||
decltype(std::get<Indices>(declargs()))>...> { | |||
return {}; | |||
} | |||
// decl value with type of bundle-of-args | |||
static auto declbundle() { | |||
return declbundle_helper( | |||
std::make_index_sequence<std::tuple_size<TInputs>::value + | |||
std::tuple_size<TOutputs>::value>()); | |||
} | |||
// type of original function | |||
using function_t = decltype(declfunction()); | |||
// type of bundle-of-args | |||
using bundle_t = decltype(declbundle()); | |||
public: | |||
// declare a new return type; it cannot be overridden once set | |||
template <typename TNewRet> | |||
auto ret() { | |||
static_assert(std::is_same<TRet, Param<Empty>>::value, | |||
"return value redefinition"); | |||
return FunctionCacheBuilder<TNewRet, TInputs, TOutputs>{}; | |||
} | |||
// declare new input | |||
template <typename TNewInput> | |||
auto input() { | |||
static_assert(std::tuple_size<TOutputs>::value == 0, | |||
"input arg cannot be declared after output"); | |||
using TNewInputs = | |||
decltype(std::tuple_cat(std::declval<TInputs>(), | |||
std::declval<std::tuple<TNewInput>>())); | |||
return FunctionCacheBuilder<TRet, TNewInputs, TOutputs>{}; | |||
} | |||
// declare new output | |||
template <typename TNewOutput> | |||
auto output() { | |||
using TNewOutputs = decltype( | |||
std::tuple_cat(std::declval<TOutputs>(), | |||
std::declval<std::tuple<TNewOutput>>())); | |||
return FunctionCacheBuilder<TRet, TInputs, TNewOutputs>{}; | |||
} | |||
// build the cached function from the declared ret/input/output signature | |||
template <typename TFunctor> | |||
function_t build(TFunctor&& func) { | |||
constexpr size_t n_inputs = std::tuple_size<TInputs>::value; | |||
constexpr size_t n_outputs = std::tuple_size<TOutputs>::value; | |||
auto cache = std::make_shared<FunctionCache<std::string(bundle_t)>>(); | |||
// bundle -> ser(in args) | |||
cache->key_mapper = [](bundle_t bundle) { | |||
StringSerializer ser; | |||
bundle.template serialize_params<0, n_inputs>(ser); | |||
return ser.take(); | |||
}; | |||
// bundle -> ser(out args) | |||
cache->value_mapper = [func](bundle_t bundle) { | |||
StringSerializer ser; | |||
TRet ret; | |||
ret.value = bundle.call_by(func); | |||
ret.serialize(ser, Empty{}); | |||
bundle.template serialize_params<n_inputs, n_inputs + n_outputs>( | |||
ser); | |||
return ser.take(); | |||
}; | |||
return [=](auto&&... args) mutable { | |||
bundle_t bundle; | |||
TRet ret; | |||
StringSerializer ser; | |||
static_assert( | |||
sizeof...(args) == std::tuple_size<TInputs>::value + | |||
std::tuple_size<TOutputs>::value, | |||
"args count mismatch"); | |||
bundle.template set_values<0, sizeof...(args)>( | |||
std::forward<decltype(args)>(args)...); | |||
ser.reset((*cache)(bundle)); | |||
ret.deserialize(ser, Empty{}); | |||
bundle.template deserialize_params<n_inputs, n_inputs + n_outputs>( | |||
ser); | |||
return ret.value; | |||
}; | |||
} | |||
}; | |||
template <typename T> | |||
class RefParam { | |||
public: | |||
T* value; | |||
Empty serialize(StringSerializer& ser, Empty) { | |||
ser.write_plain(*value); | |||
return Empty{}; | |||
} | |||
Empty deserialize(StringSerializer& ser, Empty) { | |||
*value = ser.read_plain<T>(); | |||
return Empty{}; | |||
} | |||
}; | |||
// like RefParam, but returns *value from serialize/deserialize; used together with ArrayParam | |||
template <typename T> | |||
class RefArraySizeParam { | |||
public: | |||
T* value; | |||
T serialize(StringSerializer& ser, Empty) { | |||
ser.write_plain(*value); | |||
return *value; | |||
} | |||
T deserialize(StringSerializer& ser, Empty) { | |||
ser.read_plain(value); | |||
return *value; | |||
} | |||
}; | |||
// accepts the array length from the previous param; used together with RefArraySizeParam | |||
template <typename TSize, typename TItem> | |||
class ArrayParam { | |||
public: | |||
decltype(std::declval<TItem>().value)* value; | |||
Empty serialize(StringSerializer& ser, TSize size) { | |||
TItem param; | |||
for (TSize i = 0; i < size; ++i) { | |||
param.value = value[i]; | |||
param.serialize(ser, Empty{}); | |||
} | |||
return Empty{}; | |||
} | |||
Empty deserialize(StringSerializer& ser, TSize size) { | |||
TItem param; | |||
for (TSize i = 0; i < size; ++i) { | |||
param.deserialize(ser, Empty{}); | |||
value[i] = param.value; | |||
} | |||
return Empty{}; | |||
} | |||
}; | |||
} // namespace megdnn |
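The builder is easiest to follow from a usage example. Below is a minimal sketch that memoizes a hypothetical expensive_query(int, int); FunctionCacheBuilder, Param and FunctionCache come from this header, everything else is illustrative:

#include "src/common/api_cache.h"

// hypothetical callee whose results we want to cache
size_t expensive_query(int m, int n);

void example() {
    auto cached_query =
            megdnn::FunctionCacheBuilder<>{}
                    .ret<megdnn::Param<size_t>>()  // declared return type
                    .input<megdnn::Param<int>>()   // first key argument
                    .input<megdnn::Param<int>>()   // second key argument
                    .build([](int m, int n) { return expensive_query(m, n); });

    // the first call runs expensive_query and stores its serialized result;
    // a repeated call with the same (m, n) is answered from FunctionCache
    size_t first = cached_query(32, 64);
    size_t again = cached_query(32, 64);
    (void)first;
    (void)again;
}

The cache key is the StringSerializer output of the declared inputs, so only arguments registered via input<>() participate in the lookup; parameters registered via output<>() are serialized into the cached value and written back on a hit.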
@@ -323,6 +323,34 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | |||
} | |||
} | |||
bool check_bias_share_in_channel(const TensorLayout& bias, | |||
const param::ConvBias::Format format) { | |||
bool share_in_channel = false; | |||
if (format == param::ConvBias::Format::NCHW || | |||
format == param::ConvBias::Format::NCHW4_NCHW) { | |||
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 && | |||
bias[3] == 1); | |||
} else if (format == param::ConvBias::Format::NHWC) { | |||
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 && | |||
bias[2] == 1); | |||
} else if (format == param::ConvBias::Format::NCHW4 || | |||
format == param::ConvBias::Format::NCHW8 || | |||
format == param::ConvBias::Format::NCHW32 || | |||
format == param::ConvBias::Format::NCHW4_NCHW32 || | |||
format == param::ConvBias::Format::NCHW32_NCHW4) { | |||
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 && | |||
bias[3] == 1); | |||
} else if (format == param::ConvBias::Format::NHWCD4) { | |||
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 && | |||
bias[3] == 1); | |||
} else { | |||
megdnn_assert(format == param::ConvBias::Format::CHWN4); | |||
share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 && | |||
bias[3] == 1); | |||
} | |||
return share_in_channel; | |||
} | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
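A short hedged example of the intended use of check_bias_share_in_channel; the TensorLayout constructor and dtype::Float32 follow their usage elsewhere in this diff, and the channel count is illustrative:

// hypothetical check: an NCHW bias of shape (1, C, 1, 1) is shared within each channel
megdnn::TensorLayout bias{{1, 16, 1, 1}, megdnn::dtype::Float32()};
bool shared = megdnn::check_bias_share_in_channel(
        bias, megdnn::param::ConvBias::Format::NCHW);  // expected: true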
@@ -21,6 +21,9 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | |||
const TensorND* conv_dst_tensor, | |||
const TensorND* dst_tensor, | |||
const TensorND* bias_tensor); | |||
bool check_bias_share_in_channel(const TensorLayout& bias, | |||
const param::ConvBias::Format format); | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,132 @@ | |||
/** | |||
* \file dnn/src/common/correlation.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "megdnn/oprs.h" | |||
#include "src/common/utils.h" | |||
namespace megdnn { | |||
void CorrelationBase::deduce_layout_fwd(const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
TensorLayout& dst) { | |||
megdnn_assert_contiguous(data1); | |||
megdnn_assert_contiguous(data2); | |||
megdnn_assert_contiguous(dst); | |||
auto errmsg = [&]() { | |||
return megdnn_layout_msg(data1) + ", " + megdnn_layout_msg(data2) + | |||
", " + megdnn_layout_msg(dst); | |||
}; | |||
MEGDNN_MARK_USED_VAR(errmsg); | |||
using Format = CorrelationBase::Param::Format; | |||
megdnn_assert(param().format == Format::NCHW); | |||
auto data1_dtype = data1.dtype, data2_dtype = data2.dtype; | |||
megdnn_assert(data1_dtype == data2_dtype && | |||
data1_dtype.category() == DTypeCategory::FLOAT); | |||
megdnn_assert(data1.ndim == 4_z, "%s", errmsg().c_str()); | |||
megdnn_assert(data2.ndim == 4_z, "%s", errmsg().c_str()); | |||
uint32_t pad_size = param().pad_size; | |||
uint32_t kernel_size = param().kernel_size; | |||
uint32_t stride1 = param().stride1; | |||
uint32_t stride2 = param().stride2; | |||
uint32_t max_displacement = param().max_displacement; | |||
int paddedbottomheight = data1[2] + 2 * pad_size; | |||
int paddedbottomwidth = data1[3] + 2 * pad_size; | |||
uint32_t kernel_radius = (kernel_size - 1) / 2; | |||
uint32_t border_size = max_displacement + kernel_radius; | |||
uint32_t top_width = | |||
ceil(static_cast<float>(paddedbottomwidth - border_size * 2) / | |||
static_cast<float>(stride1)); | |||
uint32_t top_height = | |||
ceil(static_cast<float>(paddedbottomheight - border_size * 2) / | |||
static_cast<float>(stride1)); | |||
uint32_t neighborhood_grid_radius = max_displacement / stride2; | |||
uint32_t neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; | |||
uint32_t top_channels = neighborhood_grid_width * neighborhood_grid_width; | |||
megdnn_assert(top_width >= 1 && top_height >= 1); | |||
dst = TensorLayout{{data1[0], top_channels, top_height, top_width}, | |||
data1.dtype}; | |||
} | |||
void CorrelationBase::check_layout_fwd(const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
const TensorLayout& dst) { | |||
TensorLayout dst_expected; | |||
megdnn_assert_eq_dtype(data1, dst); | |||
megdnn_assert_eq_shape(data1, data2); | |||
deduce_layout_fwd(data1, data2, dst_expected); | |||
megdnn_assert_eq_shape(dst_expected, dst); | |||
} | |||
void CorrelationForward::deduce_layout(const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
TensorLayout& dst) { | |||
deduce_layout_fwd(data1, data2, dst); | |||
} | |||
void CorrelationForward::check_exec(const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
const TensorLayout& dst, | |||
size_t workspace_in_bytes) { | |||
check_layout_fwd(data1, data2, dst); | |||
auto required_workspace_in_bytes = | |||
get_workspace_in_bytes(data1, data2, dst); | |||
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
} | |||
void CorrelationBackwardData1::check_exec(const TensorLayout& diff, | |||
const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
const TensorLayout& grad1, | |||
size_t workspace_in_bytes) { | |||
check_layout_fwd(grad1, data2, diff); | |||
megdnn_assert_eq_shape(data1, data2); | |||
auto required_workspace_in_bytes = | |||
get_workspace_in_bytes(diff, data1, data2, grad1); | |||
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
} | |||
void CorrelationBackwardData2::check_exec(const TensorLayout& diff, | |||
const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
const TensorLayout& grad2, | |||
size_t workspace_in_bytes) { | |||
check_layout_fwd(data1, grad2, diff); | |||
megdnn_assert_eq_shape(data1, data2); | |||
auto required_workspace_in_bytes = | |||
get_workspace_in_bytes(diff, data1, data2, grad2); | |||
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
} | |||
void CorrelationBackwardData2::deduce_layout(const TensorLayout& diff, | |||
const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
TensorLayout& grad) { | |||
megdnn_assert_eq_shape(data1, data2); | |||
check_layout_fwd(data1, data2, diff); | |||
grad = data2; | |||
} | |||
void CorrelationBackwardData1::deduce_layout(const TensorLayout& diff, | |||
const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
TensorLayout& grad) { | |||
megdnn_assert_eq_shape(data1, data2); | |||
check_layout_fwd(data1, data2, diff); | |||
grad = data1; | |||
} | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
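As a worked check of deduce_layout_fwd with assumed parameters (the values are illustrative, not from this diff); the integer arithmetic below mirrors the float ceil used above:

// hedged numeric walk-through of CorrelationBase::deduce_layout_fwd
// data1 = (1, 3, 64, 64), pad_size = 4, kernel_size = 1,
// max_displacement = 4, stride1 = stride2 = 1
uint32_t kernel_radius = (1 - 1) / 2;               // 0
uint32_t border_size = 4 + kernel_radius;           // 4
uint32_t padded_h = 64 + 2 * 4;                     // 72
uint32_t top_h = (padded_h - 2 * border_size) / 1;  // ceil((72 - 8) / 1) = 64
uint32_t grid_width = 2 * (4 / 1) + 1;              // neighborhood_grid_width = 9
uint32_t top_channels = grid_width * grid_width;    // 81
// => dst layout is (1, 81, 64, 64)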
@@ -204,7 +204,7 @@ namespace megdnn { | |||
DEF_KERN_FLOAT(ATAN2, atan2f(x, y)); | |||
DEF_KERN_FLOAT(H_SWISH_GRAD, | |||
x < -3.f ? 0.f : (x > 3.f ? y : (2.f * x + 3.f) / 6.f * y)); | |||
x < -3.f ? (ctype)0.f : (ctype)(x > 3.f ? (ctype)y : (ctype)((2.f * x + 3.f) / 6.f * y))); | |||
DEF_KERN_FLOAT(FUSE_ADD_H_SWISH, fuse_add_hswish(x, y)); | |||
#undef KERN_SIG | |||
@@ -6,7 +6,8 @@ | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "megdnn/basic_types.h" | |||
@@ -65,7 +66,7 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle, | |||
// only enable midout for CPU, because CPU might be unused when some | |||
// other platforms are used | |||
MIDOUT_BEGIN(HandlePlatform, midout_iv(megcorePlatformCPU)) { | |||
// CPU | |||
// CPU | |||
#if MEGDNN_NAIVE | |||
return make_unique<naive::HandleImpl>(computing_handle); | |||
#else | |||
@@ -90,91 +91,92 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle, | |||
} else { | |||
megdnn_throw("Debug level must be 0/1/2."); | |||
} | |||
} | |||
MIDOUT_END(); | |||
#endif | |||
} | |||
else if (platform == megcorePlatformROCM) { | |||
MIDOUT_END(); | |||
} | |||
else if (platform == megcorePlatformROCM) { | |||
#if MEGDNN_WITH_ROCM | |||
return make_rocm_handle(computing_handle); | |||
return make_rocm_handle(computing_handle); | |||
#else | |||
return nullptr; | |||
return nullptr; | |||
#endif | |||
} | |||
else if (platform == megcorePlatformCambricon) { | |||
} else if (platform == megcorePlatformCambricon) { | |||
#if MEGDNN_WITH_CAMBRICON | |||
return make_unique<cambricon::HandleImpl>(computing_handle); | |||
return make_unique<cambricon::HandleImpl>(computing_handle); | |||
#else | |||
return nullptr; | |||
return nullptr; | |||
#endif | |||
} | |||
else if (platform == megcorePlatformAtlas) { | |||
} else if (platform == megcorePlatformAtlas) { | |||
#if MEGDNN_WITH_ATLAS | |||
return make_unique<atlas::HandleImpl>(computing_handle); | |||
return make_unique<atlas::HandleImpl>(computing_handle); | |||
#else | |||
return nullptr; | |||
return nullptr; | |||
#endif | |||
} | |||
else { | |||
// CUDA | |||
megdnn_throw_if(platform != megcorePlatformCUDA, megdnn_error, | |||
"platform should be CUDA Platform"); | |||
} | |||
else { | |||
// CUDA | |||
megdnn_throw_if(platform != megcorePlatformCUDA, megdnn_error, | |||
"platform should be CUDA Platform"); | |||
#if MEGDNN_WITH_CUDA | |||
return make_unique<cuda::HandleImpl>(computing_handle); | |||
return make_unique<cuda::HandleImpl>(computing_handle); | |||
#else | |||
return nullptr; | |||
#endif | |||
} | |||
return nullptr; | |||
#endif | |||
} | |||
void Handle::set_destructor(const thin_function<void()>& d) { | |||
megdnn_assert(!m_destructor, "destructor can be set only once"); | |||
m_destructor = d; | |||
} | |||
Handle::~Handle() { | |||
if (m_destructor) | |||
m_destructor(); | |||
m_alive_magic = 0; | |||
} | |||
size_t Handle::alignment_requirement() const { | |||
// default to 32 | |||
return 32; | |||
return nullptr; | |||
} | |||
void Handle::set_destructor(const thin_function<void()>& d) { | |||
megdnn_assert(!m_destructor, "destructor can be set only once"); | |||
m_destructor = d; | |||
} | |||
Handle::~Handle() { | |||
if (m_destructor) | |||
m_destructor(); | |||
m_alive_magic = 0; | |||
} | |||
size_t Handle::alignment_requirement() const { | |||
// default to 32 | |||
return 32; | |||
} | |||
size_t Handle::image2d_pitch_alignment() const { | |||
megdnn_throw("image2d tensor format not supported on this handle"); | |||
} | |||
megdnn::HandleImplHelper::HandleVendorType Handle::vendor_type() const { | |||
return HandleVendorType::NOT_SPEC; | |||
} | |||
bool Handle::check_cross_dev_copy_constraint(const TensorLayout& src) { | |||
return src.is_contiguous(); | |||
} | |||
void Handle::on_opr_destructed(OperatorBase* opr) { | |||
if (m_alive_magic != ALIVE_MAGIC) { | |||
megdnn_log_error( | |||
"Handle is destructed before opr gets destructed. " | |||
"Please fix the destruction order as this would cause " | |||
"undefined memory access. " | |||
"Abort now to avoid further problems."); | |||
abort(); | |||
} | |||
size_t Handle::image2d_pitch_alignment() const { | |||
megdnn_throw("image2d tensor format not supported on this handle"); | |||
if (m_on_opr_destructed) { | |||
m_on_opr_destructed(opr); | |||
} | |||
} | |||
megdnn::HandleImplHelper::HandleVendorType Handle::vendor_type() const { | |||
return HandleVendorType::NOT_SPEC; | |||
} | |||
OperatorBase::~OperatorBase() { | |||
m_handle->on_opr_destructed(this); | |||
} | |||
bool Handle::check_cross_dev_copy_constraint(const TensorLayout& src) { | |||
return src.is_contiguous(); | |||
} | |||
void Handle::on_opr_destructed(OperatorBase * opr) { | |||
if (m_alive_magic != ALIVE_MAGIC) { | |||
megdnn_log_error( | |||
"Handle is destructed before opr gets destructed. " | |||
"Please fix the destruction order as this would cause " | |||
"undefined memory access. " | |||
"Abort now to avoid further problems."); | |||
abort(); | |||
} | |||
if (m_on_opr_destructed) { | |||
m_on_opr_destructed(opr); | |||
} | |||
} | |||
OperatorBase::~OperatorBase() { m_handle->on_opr_destructed(this); } | |||
template <typename Opr> | |||
std::unique_ptr<Opr> Handle::create_operator() { | |||
template <typename Opr> | |||
std::unique_ptr<Opr> Handle::create_operator() { | |||
#define CASE(etype, nm) \ | |||
case HandleType::etype: { \ | |||
MIDOUT_BEGIN(HandleOpr, Opr, midout_iv(HandleType::etype)) { \ | |||
@@ -183,48 +185,47 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle, | |||
MIDOUT_END(); \ | |||
} | |||
switch (m_handle_type) { | |||
CASE(NAIVE, naive); | |||
switch (m_handle_type) { | |||
CASE(NAIVE, naive); | |||
#if !MEGDNN_NAIVE | |||
CASE(FALLBACK, fallback); | |||
CASE(FALLBACK, fallback); | |||
#if MEGDNN_X86 | |||
CASE(X86, x86); | |||
CASE(X86, x86); | |||
#endif | |||
#if MEGDNN_ARMV7 | |||
CASE(ARMV7, armv7); | |||
CASE(ARMV7, armv7); | |||
#endif | |||
#if MEGDNN_AARCH64 | |||
CASE(AARCH64, aarch64); | |||
CASE(AARCH64, aarch64); | |||
#endif | |||
#if MEGDNN_ARMV7 || MEGDNN_AARCH64 | |||
CASE(ARM_COMMON, arm_common); | |||
CASE(ARM_COMMON, arm_common); | |||
#endif | |||
#endif // !MEGDNN_NAIVE | |||
#if MEGDNN_WITH_CUDA | |||
CASE(CUDA,cuda); | |||
CASE(CUDA, cuda); | |||
#endif | |||
#if MEGDNN_WITH_ATLAS | |||
CASE(ATLAS, atlas); | |||
CASE(ATLAS, atlas); | |||
#endif | |||
#if MEGDNN_WITH_ROCM | |||
case HandleType::ROCM: { | |||
MIDOUT_BEGIN(HandleOpr, Opr, midout_iv(HandleType::ROCM)) { | |||
return create_rocm_operator<Opr>(); | |||
} | |||
MIDOUT_END(); | |||
case HandleType::ROCM: { | |||
MIDOUT_BEGIN(HandleOpr, Opr, midout_iv(HandleType::ROCM)) { | |||
return create_rocm_operator<Opr>(); | |||
} | |||
MIDOUT_END(); | |||
} | |||
#endif | |||
#if MEGDNN_WITH_CAMBRICON | |||
CASE(CAMBRICON, cambricon); | |||
#endif | |||
default: | |||
megdnn_throw("bad handle type"); | |||
} | |||
#undef CASE | |||
default: | |||
megdnn_throw("bad handle type"); | |||
} | |||
#undef CASE | |||
} | |||
#define INST(opr) template std::unique_ptr<opr> Handle::create_operator(); | |||
MEGDNN_FOREACH_OPR_CLASS(INST) | |||
MEGDNN_FOREACH_OPR_CLASS(INST) | |||
#undef INST | |||
// vim: syntax=cpp.doxygen | |||
@@ -194,6 +194,9 @@ private: | |||
cb(LocalShareBackwardFilter) \ | |||
cb(ROIAlignForward) \ | |||
cb(ROIAlignBackward) \ | |||
cb(CorrelationForward) \ | |||
cb(CorrelationBackwardData1) \ | |||
cb(CorrelationBackwardData2) \ | |||
cb(BatchConvBiasForward) \ | |||
cb(Remap) \ | |||
cb(RemapBackwardData) \ | |||
@@ -23,6 +23,8 @@ void Images2NeibsBase::deduce_layout_fwd(const TensorLayout &src, | |||
"pad_w=" + std::to_string(param().pad_w) + ", " + | |||
"stride_h=" + std::to_string(param().stride_h) + ", " + | |||
"stride_w=" + std::to_string(param().stride_w) + ", " + | |||
"dilate_h=" + std::to_string(param().dilate_h) + ", " + | |||
"dilate_w=" + std::to_string(param().dilate_w) + ", " + | |||
"window_h=" + std::to_string(param().window_h) + ", " + | |||
"window_w=" + std::to_string(param().window_w); | |||
}; | |||
@@ -34,11 +36,13 @@ void Images2NeibsBase::deduce_layout_fwd(const TensorLayout &src, | |||
size_t pw = this->param().pad_w; | |||
size_t sh = this->param().stride_h; | |||
size_t sw = this->param().stride_w; | |||
size_t dh = this->param().dilate_h; | |||
size_t dw = this->param().dilate_w; | |||
size_t wh = this->param().window_h; | |||
size_t ww = this->param().window_w; | |||
size_t oh, ow; | |||
infer_conv_shape2d(ih, iw, wh, ww, sh, sw, ph, pw, oh, ow); | |||
infer_conv_shape2d(ih, iw, wh+(wh-1)*(dh-1), ww+(ww-1)*(dw-1), sh, sw, ph, pw, oh, ow); | |||
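    // with dilation the effective window extent becomes wh + (wh-1)*(dh-1) | |||
    // (and ww + (ww-1)*(dw-1)); e.g. a 3x3 window with dilation 2 spans 5x5 input pixels | |||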
dst = TensorLayout(TensorShape({n, ic, oh, ow, wh, ww}), src.dtype); | |||
} | |||
@@ -54,6 +54,9 @@ DEF(BNForward, 8, true, true); | |||
DEF(BNBackward, 8, true, false); | |||
DEF(ROIPoolingForward, 4, true, false); | |||
DEF(ROIPoolingBackward, 5, true, false); | |||
DEF(CorrelationForward, 3, true, true); | |||
DEF(CorrelationBackwardData1, 4, true, true); | |||
DEF(CorrelationBackwardData2, 4, true, true); | |||
DEF(WarpPerspectiveForward, 3, true, false); | |||
DEF(WarpPerspectiveBackwardData, 3, true, false); | |||
DEF(WarpPerspectiveBackwardMat, 4, true, false); | |||
@@ -41,7 +41,7 @@ bool is_transpose(const TensorLayout& src, const TensorLayout& dst, | |||
namespace transpose_fallback { | |||
#if MEGDNN_X86 | |||
#if MEGDNN_X86 || MEGDNN_NAIVE | |||
constexpr size_t BLOCK_LINE_SIZE_BYTES = 64; | |||
#elif MEGDNN_AARCH64 || MEGDNN_ARMV7 /*BEGIN-INLINE-INTERNAL*/ || \ | |||
MEGDNN_MIPS /*END-INLINE-INTERNAL*/ | |||
@@ -0,0 +1,109 @@ | |||
/** | |||
* \file dnn/src/cuda/api_cache.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "src/common/api_cache.h" | |||
#include "src/cuda/cudnn_wrapper.h" | |||
namespace megdnn { | |||
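// The *Param helpers below flatten the fields of a cuDNN descriptor into a | |||
// StringSerializer, presumably so that the descriptor contents can act as part | |||
// of a lookup key when caching the results of cuDNN queries (see | |||
// src/common/api_cache.h included above). | |||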
class CudnnConvDescParam { | |||
public: | |||
cudnnConvolutionDescriptor_t value; | |||
Empty serialize(StringSerializer& ser, Empty) { | |||
constexpr int maxNbDims = CUDNN_DIM_MAX - 2; | |||
int nbDims = maxNbDims; | |||
int padA[maxNbDims]; | |||
int strideA[maxNbDims]; | |||
int dilationA[maxNbDims]; | |||
cudnnConvolutionMode_t mode; | |||
cudnnDataType_t computeType; | |||
cudnnGetConvolutionNdDescriptor(value, maxNbDims, &nbDims, padA, | |||
strideA, dilationA, &mode, | |||
&computeType); | |||
ser.write_plain(nbDims); | |||
for (int i = 0; i < nbDims; ++i) { | |||
ser.write_plain(padA[i]); | |||
ser.write_plain(strideA[i]); | |||
ser.write_plain(dilationA[i]); | |||
} | |||
ser.write_plain(mode); | |||
ser.write_plain(computeType); | |||
return Empty{}; | |||
} | |||
}; | |||
class CudnnTensorDescParam { | |||
public: | |||
cudnnTensorDescriptor_t value; | |||
Empty serialize(StringSerializer& ser, Empty) { | |||
int nbDims = MEGDNN_MAX_NDIM; | |||
cudnnDataType_t dataType; | |||
int dimA[MEGDNN_MAX_NDIM]; | |||
int strideA[MEGDNN_MAX_NDIM]; | |||
cudnnGetTensorNdDescriptor(value, MEGDNN_MAX_NDIM, &dataType, &nbDims, | |||
dimA, strideA); | |||
ser.write_plain(nbDims); | |||
for (int i = 0; i < nbDims; ++i) { | |||
ser.write_plain(dimA[i]); | |||
ser.write_plain(strideA[i]); | |||
} | |||
ser.write_plain(dataType); | |||
return Empty{}; | |||
} | |||
}; | |||
class CudnnFilterDescParam { | |||
public: | |||
cudnnFilterDescriptor_t value; | |||
Empty serialize(StringSerializer& ser, Empty) { | |||
int nbDims = MEGDNN_MAX_NDIM; | |||
cudnnDataType_t dataType; | |||
cudnnTensorFormat_t format; | |||
int filterDimA[MEGDNN_MAX_NDIM]; | |||
cudnnGetFilterNdDescriptor(value, nbDims, &dataType, &format, &nbDims, | |||
filterDimA); | |||
ser.write_plain(nbDims); | |||
for (int i = 0; i < nbDims; ++i) { | |||
ser.write_plain(filterDimA[i]); | |||
} | |||
ser.write_plain(dataType); | |||
ser.write_plain(format); | |||
return Empty{}; | |||
} | |||
}; | |||
template <typename T> | |||
class CudnnConvAlgoPerfParam { | |||
public: | |||
T value; | |||
Empty serialize(StringSerializer& ser, Empty) { | |||
ser.write_plain(value.algo); | |||
ser.write_plain(value.status); | |||
ser.write_plain(value.time); | |||
ser.write_plain(value.memory); | |||
ser.write_plain(value.determinism); | |||
ser.write_plain(value.mathType); | |||
return Empty{}; | |||
} | |||
Empty deserialize(StringSerializer& ser, Empty) { | |||
ser.read_plain(&value.algo); | |||
ser.read_plain(&value.status); | |||
ser.read_plain(&value.time); | |||
ser.read_plain(&value.memory); | |||
ser.read_plain(&value.determinism); | |||
ser.read_plain(&value.mathType); | |||
return Empty{}; | |||
} | |||
}; | |||
} // namespace megdnn |
@@ -9,7 +9,7 @@ | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "src/common/utils.h" | |||
#include "src/common/conv_bias.h" | |||
#include "src/cuda/batch_conv_bias/algo.h" | |||
#include "src/cuda/batch_conv_bias/batch_conv_bias.cuh" | |||
#include "src/cuda/batch_conv_bias/opr_impl.h" | |||
@@ -106,7 +106,7 @@ bool BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdGemm::is_available( | |||
using Mode = Param::Mode; | |||
bool available = true; | |||
auto&& param = args.opr->param(); | |||
if (!conv_bias::check_bias_share_in_channel(args.bias_layout, param.format)) | |||
if (!check_bias_share_in_channel(args.bias_layout, param.format)) | |||
return false; | |||
if (param.format != Format::NCHW4) | |||
return false; | |||
@@ -10,7 +10,7 @@ | |||
*/ | |||
#include "megdnn/oprs/general.h" | |||
#include "src/common/utils.h" | |||
#include "src/common/conv_bias.h" | |||
#include "src/cuda/batch_conv_bias/algo.h" | |||
#include "src/cuda/batch_conv_bias/batch_conv_bias.cuh" | |||
#include "src/cuda/batch_conv_bias/opr_impl.h" | |||
@@ -86,7 +86,7 @@ bool BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemmPrecomp:: | |||
using Mode = Param::Mode; | |||
bool available = true; | |||
auto&& param = args.opr->param(); | |||
if (!conv_bias::check_bias_share_in_channel(args.bias_layout, param.format)) | |||
if (!check_bias_share_in_channel(args.bias_layout, param.format)) | |||
return false; | |||
if (param.format != Format::NCHW4) | |||
return false; | |||
@@ -115,7 +115,8 @@ public: | |||
size_t get_workspace_in_bytes(const SizeArgs& /*args*/) const override; | |||
void exec(const ExecArgs& args) const final; | |||
AlgoAttribute attribute() const override { | |||
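        // cuBLAS may pick a different reduction/splitting strategy depending on | |||
        // the batch size, so floating-point results can vary slightly with the | |||
        // batch; hence ACCURACY_DEPEND_ON_BATCH in addition to REPRODUCIBLE. | |||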
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
const char* name() const override { return "CUBLAS"; } | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_CUBLAS) | |||
@@ -128,7 +129,8 @@ public: | |||
size_t get_workspace_in_bytes(const SizeArgs& /*args*/) const override; | |||
void exec(const ExecArgs& args) const final; | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
const char* name() const override { return "CUBLAS_LT"; } | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_CUBLASLT) | |||
@@ -173,6 +173,9 @@ public: | |||
if (m_attr.is_reproducible) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
if (m_attr.accuracy_depend_on_batch) { | |||
ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
return ret; | |||
} | |||
@@ -280,6 +283,9 @@ public: | |||
if (m_attr.is_reproducible) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
if (m_attr.accuracy_depend_on_batch) { | |||
ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
return ret; | |||
} | |||
@@ -352,7 +358,8 @@ public: | |||
const OperatorBase* opr) const override; | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
private: | |||
@@ -406,7 +413,8 @@ public: | |||
const OperatorBase* opr) const override; | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_BATCHED_MATMUL) | |||
@@ -428,7 +436,14 @@ public: | |||
const char* name() const override { return m_name.c_str(); } | |||
AlgoAttribute attribute() const override { | |||
auto ret = static_cast<AlgoAttribute>(0); | |||
auto ret = AlgoAttribute::DEFAULT; | |||
#define cb(attr) \ | |||
if (m_impl->contain_attribute_all(attr)) { \ | |||
ret |= attr; \ | |||
} | |||
MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb) | |||
#undef cb | |||
if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
@@ -39,7 +39,8 @@ bool ConvBiasForwardImpl::AlgoCUDNNConv::is_available( | |||
conv_args.init_conv_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionForwardWorkspaceSize( | |||
auto& cudnn = conv_args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionForwardWorkspaceSize( | |||
conv_args.handle->cudnn_handle(), D.src_desc.desc, | |||
D.filter_desc.desc, D.conv_desc.conv_desc, D.dst_desc.desc, | |||
m_cudnn_enum, &workspace_size); | |||
@@ -65,7 +66,8 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_bundle( | |||
conv_args.init_conv_desc(D); | |||
size_t conv_workspace_size; | |||
auto status = cudnnGetConvolutionForwardWorkspaceSize( | |||
auto& cudnn = conv_args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionForwardWorkspaceSize( | |||
conv_args.handle->cudnn_handle(), D.src_desc.desc, | |||
D.filter_desc.desc, D.conv_desc.conv_desc, D.dst_desc.desc, | |||
m_cudnn_enum, &conv_workspace_size); | |||
@@ -16,6 +16,7 @@ | |||
#include "src/cuda/conv_bias/helper.h" | |||
#include "src/cuda/cudnn_wrapper.h" | |||
#include "src/cuda/utils.h" | |||
#include "src/common/conv_bias.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -29,7 +30,7 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( | |||
} | |||
if (args.bias_layout->ndim == 0 || | |||
!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
!check_bias_share_in_channel(*(args.bias_layout), | |||
args.opr->param().format)) { | |||
return false; | |||
} | |||
@@ -107,7 +108,8 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( | |||
megdnn_throw("unsupported NonlineMode"); | |||
} | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionForwardWorkspaceSize( | |||
auto& cudnn = args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionForwardWorkspaceSize( | |||
args.handle->cudnn_handle(), D.src_desc.desc, D.filter_desc.desc, | |||
D.conv_desc.conv_desc, D.dst_desc.desc, m_cudnn_enum, | |||
&workspace_size); | |||
@@ -120,7 +122,8 @@ size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::get_workspace_in_bytes( | |||
args.init_conv_bias_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionForwardWorkspaceSize( | |||
auto& cudnn = args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionForwardWorkspaceSize( | |||
args.handle->cudnn_handle(), D.src_desc.desc, D.filter_desc.desc, | |||
D.conv_desc.conv_desc, D.dst_desc.desc, m_cudnn_enum, | |||
&workspace_size); | |||
@@ -168,34 +168,6 @@ bool is_cudnn_supported(const BiasForwardSizeArgs& args) { | |||
return supported; | |||
} | |||
bool check_bias_share_in_channel(const TensorLayout& bias, | |||
const param::ConvBias::Format format) { | |||
bool share_in_channel = false; | |||
if (format == param::ConvBias::Format::NCHW || | |||
format == param::ConvBias::Format::NCHW4_NCHW) { | |||
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 && | |||
bias[3] == 1); | |||
} else if (format == param::ConvBias::Format::NHWC) { | |||
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 && | |||
bias[2] == 1); | |||
} else if (format == param::ConvBias::Format::NCHW4 || | |||
format == param::ConvBias::Format::NCHW8 || | |||
format == param::ConvBias::Format::NCHW32 || | |||
format == param::ConvBias::Format::NCHW4_NCHW32 || | |||
format == param::ConvBias::Format::NCHW32_NCHW4) { | |||
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 && | |||
bias[3] == 1); | |||
} else if (format == param::ConvBias::Format::NHWCD4) { | |||
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 && | |||
bias[3] == 1); | |||
} else { | |||
megdnn_assert(format == param::ConvBias::Format::CHWN4); | |||
share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 && | |||
bias[3] == 1); | |||
} | |||
return share_in_channel; | |||
} | |||
SmallVector<size_t> matmul_get_workspace_bundle( | |||
const BiasForwardSizeArgs& args) { | |||
auto dtype = args.src_layout->dtype; | |||
@@ -126,9 +126,6 @@ namespace conv_bias { | |||
} | |||
}; | |||
bool check_bias_share_in_channel(const TensorLayout& bias, | |||
const param::ConvBias::Format format); | |||
} // namespace conv_bias | |||
} // namespace cuda | |||
} // namespace megdnn | |||
@@ -15,6 +15,7 @@ | |||
#include "src/cuda/convolution_helper/layout.cuh" | |||
#include "src/cuda/convolution_helper/parameter.cuh" | |||
#include "src/cuda/utils.h" | |||
#include "src/common/conv_bias.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -83,7 +84,7 @@ bool ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm::is_available( | |||
bool available = true; | |||
auto&& param = args.opr->param(); | |||
auto&& fm = args.filter_meta; | |||
if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
if (!check_bias_share_in_channel(*(args.bias_layout), | |||
param.format)) | |||
return false; | |||
if (param.format != Format::CHWN4) | |||
@@ -15,6 +15,7 @@ | |||
#include "src/cuda/convolution_helper/layout.cuh" | |||
#include "src/cuda/convolution_helper/parameter.cuh" | |||
#include "src/cuda/utils.h" | |||
#include "src/common/conv_bias.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -71,7 +72,7 @@ bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::is_available( | |||
bool available = true; | |||
auto&& param = args.opr->param(); | |||
auto&& fm = args.filter_meta; | |||
if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
if (!check_bias_share_in_channel(*(args.bias_layout), | |||
param.format)) | |||
return false; | |||
if (param.format != Format::CHWN4) | |||
@@ -15,6 +15,7 @@ | |||
#include "src/cuda/convolution_helper/layout.cuh" | |||
#include "src/cuda/convolution_helper/parameter.cuh" | |||
#include "src/cuda/utils.h" | |||
#include "src/common/conv_bias.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -118,7 +119,7 @@ bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter:: | |||
bool available = true; | |||
auto&& param = args.opr->param(); | |||
auto&& fm = args.filter_meta; | |||
if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
if (!check_bias_share_in_channel(*(args.bias_layout), | |||
param.format)) | |||
return false; | |||
if (param.format != Format::CHWN4) | |||
@@ -15,6 +15,7 @@ | |||
#include "src/cuda/convolution_helper/layout.cuh" | |||
#include "src/cuda/convolution_helper/parameter.cuh" | |||
#include "src/cuda/utils.h" | |||
#include "src/common/conv_bias.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -118,7 +119,7 @@ bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth:: | |||
bool available = true; | |||
auto&& param = args.opr->param(); | |||
auto&& fm = args.filter_meta; | |||
if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
if (!check_bias_share_in_channel(*(args.bias_layout), | |||
param.format)) | |||
return false; | |||
if (param.format != Format::CHWN4) | |||
@@ -14,6 +14,7 @@ | |||
#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" | |||
#include "src/cuda/convolution_helper/parameter.cuh" | |||
#include "src/cuda/utils.h" | |||
#include "src/common/conv_bias.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -32,7 +33,7 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( | |||
bool available = true; | |||
auto&& param = args.opr->param(); | |||
auto&& fm = args.filter_meta; | |||
if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
if (!check_bias_share_in_channel(*(args.bias_layout), | |||
param.format)) | |||
return false; | |||
if (param.format != Format::NCHW32 && param.format != Format::NCHW32_NCHW4) | |||
@@ -13,6 +13,7 @@ | |||
#include "src/cuda/utils.h" | |||
#include "src/cuda/convolution_helper/parameter.cuh" | |||
#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" | |||
#include "src/common/conv_bias.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -29,7 +30,7 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( | |||
bool available = true; | |||
auto&& param = args.opr->param(); | |||
auto&& fm = args.filter_meta; | |||
if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
if (!check_bias_share_in_channel(*(args.bias_layout), | |||
param.format)) | |||
return false; | |||
if (param.format == Format::NCHW4_NCHW32) { | |||
@@ -12,6 +12,7 @@ | |||
#include "./algo.h" | |||
#include "src/cuda/utils.h" | |||
#include "src/cuda/convolution_helper/bias_visitor.cuh" | |||
#include "src/common/conv_bias.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -29,7 +30,7 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::is_available( | |||
bool available = true; | |||
auto&& param = args.opr->param(); | |||
auto&& fm = args.filter_meta; | |||
if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
if (!check_bias_share_in_channel(*(args.bias_layout), | |||
param.format)) | |||
return false; | |||
if (param.format != Format::NCHW4) | |||
@@ -83,12 +83,13 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||
CUDNNForwardDescs desc; | |||
conv_args.init_conv_desc(desc); | |||
#if CUDNN_MAJOR >= 7 | |||
auto& cudnn = static_cast<HandleImpl*>(this->handle())->cudnn(); | |||
int max_count = 0; | |||
cudnn_check(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, | |||
cudnn_check(cudnn.GetConvolutionForwardAlgorithmMaxCount(cudnn_handle, | |||
&max_count)); | |||
SmallVector<cudnnConvolutionFwdAlgoPerf_t> algo_perf(max_count); | |||
int ret_count = 0; | |||
cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7( | |||
cudnn_check(cudnn.GetConvolutionForwardAlgorithm_v7( | |||
cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc, | |||
desc.conv_desc.conv_desc, desc.dst_desc.desc, max_count, | |||
&ret_count, algo_perf.data())); | |||
@@ -127,6 +127,9 @@ public: | |||
if (m_attr.is_reproducible) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
if (m_attr.accuracy_depend_on_batch) { | |||
ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
return ret; | |||
} | |||
cudnnConvolutionBwdDataAlgo_t cudnn_enum() const { return m_cudnn_enum; } | |||
@@ -158,7 +161,8 @@ public: | |||
const char* name() const override { return "MATMUL"; } | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
}; | |||
@@ -184,7 +188,8 @@ public: | |||
const char* name() const override { return "CHANNEL_WISE_SMALL"; } | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL) | |||
AlgoAttribute attribute() const override { | |||
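        // the "small" channel-wise kernel only handles a restricted set of | |||
        // filter/feature-map sizes, so its usability depends on the concrete | |||
        // shapes; USABLE_DEPEND_ON_SHAPE records exactly that. | |||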
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
}; | |||
@@ -231,7 +236,13 @@ public: | |||
TensorLayout& grad_pg); | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL) | |||
AlgoAttribute attribute() const override { | |||
auto ret = static_cast<AlgoAttribute>(0); | |||
auto ret = AlgoAttribute::DEFAULT; | |||
#define cb(attr) \ | |||
if (m_impl->contain_attribute_all(attr)) { \ | |||
ret |= attr; \ | |||
} | |||
MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb) | |||
#undef cb | |||
if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
@@ -44,9 +44,10 @@ bool ConvolutionBackwardDataImpl::AlgoCUDNN::is_available( | |||
} | |||
#endif | |||
auto& cudnn = args.handle->cudnn(); | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionBackwardDataWorkspaceSize( | |||
auto status = cudnn.GetConvolutionBackwardDataWorkspaceSize( | |||
args.handle->cudnn_handle(), | |||
D.filter_desc.desc, | |||
D.diff_desc.desc, | |||
@@ -59,10 +60,11 @@ bool ConvolutionBackwardDataImpl::AlgoCUDNN::is_available( | |||
size_t ConvolutionBackwardDataImpl::AlgoCUDNN::get_workspace_in_bytes( | |||
const SizeArgs &args) const { | |||
auto& cudnn = args.handle->cudnn(); | |||
CUDNNBwdDataDescs D; | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionBackwardDataWorkspaceSize( | |||
auto status = cudnn.GetConvolutionBackwardDataWorkspaceSize( | |||
args.handle->cudnn_handle(), | |||
D.filter_desc.desc, | |||
D.diff_desc.desc, | |||
@@ -123,6 +123,9 @@ public: | |||
if (m_attr.is_reproducible) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
if (m_attr.accuracy_depend_on_batch) { | |||
ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
return ret; | |||
} | |||
@@ -155,7 +158,8 @@ public: | |||
const char* name() const override { return "MATMUL"; } | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
}; | |||
@@ -21,6 +21,7 @@ using namespace convolution; | |||
bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available( | |||
const SizeArgs &args) const { | |||
auto& cudnn = args.handle->cudnn(); | |||
CUDNNBwdFilterDescs D; | |||
if (!is_cudnn_supported(args.as_fwd_args())) | |||
@@ -28,7 +29,7 @@ bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available( | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize( | |||
auto status = cudnn.GetConvolutionBackwardFilterWorkspaceSize( | |||
args.handle->cudnn_handle(), | |||
D.src_desc.desc, | |||
D.diff_desc.desc, | |||
@@ -41,10 +42,11 @@ bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available( | |||
size_t ConvolutionBackwardFilterImpl::AlgoCUDNN::get_workspace_in_bytes( | |||
const SizeArgs &args) const { | |||
auto& cudnn = args.handle->cudnn(); | |||
CUDNNBwdFilterDescs D; | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize( | |||
auto status = cudnn.GetConvolutionBackwardFilterWorkspaceSize( | |||
args.handle->cudnn_handle(), | |||
D.src_desc.desc, | |||
D.diff_desc.desc, | |||
@@ -141,12 +141,13 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter, | |||
#if CUDNN_MAJOR >= 7 | |||
MEGDNN_MARK_USED_VAR(negative_attr); | |||
auto& cudnn = args.handle->cudnn(); | |||
int max_count = 0; | |||
cudnn_check(cudnnGetConvolutionBackwardDataAlgorithmMaxCount( | |||
cudnn_check(cudnn.GetConvolutionBackwardDataAlgorithmMaxCount( | |||
cudnn_handle, &max_count)); | |||
SmallVector<cudnnConvolutionBwdDataAlgoPerf_t> algo_perf(max_count); | |||
int ret_count = 0; | |||
cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7( | |||
cudnn_check(cudnn.GetConvolutionBackwardDataAlgorithm_v7( | |||
cudnn_handle, desc.filter_desc.desc, desc.diff_desc.desc, | |||
desc.conv_desc.desc, desc.grad_desc.desc, max_count, &ret_count, | |||
algo_perf.data())); | |||
@@ -286,12 +287,13 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( | |||
#endif | |||
#if CUDNN_MAJOR >= 7 | |||
MEGDNN_MARK_USED_VAR(negative_attr); | |||
auto& cudnn = args.handle->cudnn(); | |||
int max_count = 0; | |||
cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( | |||
cudnn_check(cudnn.GetConvolutionBackwardFilterAlgorithmMaxCount( | |||
cudnn_handle, &max_count)); | |||
SmallVector<cudnnConvolutionBwdFilterAlgoPerf_t> algo_perf(max_count); | |||
int ret_count = 0; | |||
cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm_v7( | |||
cudnn_check(cudnn.GetConvolutionBackwardFilterAlgorithm_v7( | |||
cudnn_handle, desc.src_desc.desc, desc.diff_desc.desc, | |||
desc.conv_desc.desc, desc.grad_desc.desc, max_count, &ret_count, | |||
algo_perf.data())); | |||
@@ -119,6 +119,9 @@ public: | |||
if (m_attr.is_reproducible) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
if (m_attr.accuracy_depend_on_batch) { | |||
ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
return ret; | |||
} | |||
@@ -28,7 +28,8 @@ bool Convolution3DBackwardDataImpl::AlgoCUDNN::is_available( | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionBackwardDataWorkspaceSize( | |||
auto& cudnn = args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionBackwardDataWorkspaceSize( | |||
args.handle->cudnn_handle(), | |||
D.filter_desc.desc, | |||
D.diff_desc.desc, | |||
@@ -44,7 +45,8 @@ size_t Convolution3DBackwardDataImpl::AlgoCUDNN::get_workspace_in_bytes( | |||
CUDNNBwdDataDescs D; | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionBackwardDataWorkspaceSize( | |||
auto& cudnn = args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionBackwardDataWorkspaceSize( | |||
args.handle->cudnn_handle(), | |||
D.filter_desc.desc, | |||
D.diff_desc.desc, | |||
@@ -112,6 +112,9 @@ public: | |||
if (m_attr.is_reproducible) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
if (m_attr.accuracy_depend_on_batch) { | |||
ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
return ret; | |||
} | |||
@@ -28,7 +28,8 @@ bool Convolution3DBackwardFilterImpl::AlgoCUDNN::is_available( | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize( | |||
auto& cudnn = args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionBackwardFilterWorkspaceSize( | |||
args.handle->cudnn_handle(), D.src_desc.desc, D.diff_desc.desc, | |||
D.conv_desc.desc, D.grad_desc.desc, m_cudnn_enum, &workspace_size); | |||
return status == CUDNN_STATUS_SUCCESS; | |||
@@ -40,7 +41,8 @@ size_t Convolution3DBackwardFilterImpl::AlgoCUDNN::get_workspace_in_bytes( | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize( | |||
auto& cudnn = args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionBackwardFilterWorkspaceSize( | |||
args.handle->cudnn_handle(), D.src_desc.desc, D.diff_desc.desc, | |||
D.conv_desc.desc, D.grad_desc.desc, m_cudnn_enum, &workspace_size); | |||
megdnn_assert(status == CUDNN_STATUS_SUCCESS, | |||
@@ -106,7 +106,8 @@ public: | |||
const char* name() const override { return "1x1x1"; } | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_1X1X1) | |||
}; | |||
@@ -126,10 +127,17 @@ public: | |||
const char* name() const override { return m_name.c_str(); } | |||
AlgoAttribute attribute() const override { | |||
auto ret = static_cast<AlgoAttribute>(0); | |||
auto ret = AlgoAttribute::DEFAULT; | |||
if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
#define cb(attr) \ | |||
if (m_impl->contain_attribute_all(attr)) { \ | |||
ret |= attr; \ | |||
} | |||
MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb) | |||
#undef cb | |||
return ret; | |||
} | |||
static void modify_size_args(SizeArgs& args, TensorLayout& src_pg, | |||
@@ -157,6 +165,9 @@ public: | |||
if (m_attr.is_reproducible) { | |||
ret |= AlgoAttribute::REPRODUCIBLE; | |||
} | |||
if (m_attr.accuracy_depend_on_batch) { | |||
ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
return ret; | |||
} | |||
@@ -27,7 +27,8 @@ bool Convolution3DForwardImpl::AlgoCUDNN::is_available( | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionForwardWorkspaceSize( | |||
auto& cudnn = args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionForwardWorkspaceSize( | |||
args.handle->cudnn_handle(), | |||
D.src_desc.desc, | |||
D.filter_desc.desc, | |||
@@ -43,7 +44,8 @@ size_t Convolution3DForwardImpl::AlgoCUDNN::get_workspace_in_bytes( | |||
CUDNNForwardDescs D; | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
auto status = cudnnGetConvolutionForwardWorkspaceSize( | |||
auto& cudnn = args.handle->cudnn(); | |||
auto status = cudnn.GetConvolutionForwardWorkspaceSize( | |||
args.handle->cudnn_handle(), | |||
D.src_desc.desc, | |||
D.filter_desc.desc, | |||
@@ -92,7 +92,7 @@ namespace convolution3d { | |||
const Workspace &workspace, void *&raw_ptr); | |||
inline bool cudnn_get_convolution_fwd_algo_helper( | |||
cudnnHandle_t cudnn_handle, const cudnnTensorDescriptor_t x_desc, | |||
Handle* handle, const cudnnTensorDescriptor_t x_desc, | |||
const cudnnFilterDescriptor_t w_desc, | |||
const cudnnConvolutionDescriptor_t conv_desc, | |||
const cudnnTensorDescriptor_t y_desc, | |||
@@ -102,13 +102,14 @@ namespace convolution3d { | |||
MEGDNN_MARK_USED_VAR(positive_attr); | |||
MEGDNN_MARK_USED_VAR(negative_attr); | |||
#if CUDNN_MAJOR >= 7 | |||
auto& cudnn = static_cast<HandleImpl*>(handle)->cudnn(); | |||
int algo_max_count = 0; | |||
cudnn_check(cudnnGetConvolutionForwardAlgorithmMaxCount( | |||
cudnn_handle, &algo_max_count)); | |||
cudnn_check(cudnn.GetConvolutionForwardAlgorithmMaxCount( | |||
cuda::cudnn_handle(handle), &algo_max_count)); | |||
SmallVector<cudnnConvolutionFwdAlgoPerf_t> algo_perf(algo_max_count); | |||
int algo_count = 0; | |||
cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7( | |||
cudnn_handle, x_desc, w_desc, conv_desc, y_desc, algo_max_count, | |||
cudnn_check(cudnn.GetConvolutionForwardAlgorithm_v7( | |||
cuda::cudnn_handle(handle), x_desc, w_desc, conv_desc, y_desc, algo_max_count, | |||
&algo_count, algo_perf.data())); | |||
for (int i = 0; i < algo_count; ++i) { | |||
if (algo_perf[i].algo == | |||
@@ -116,8 +117,8 @@ namespace convolution3d { | |||
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) | |||
continue; | |||
size_t workspace_size = 0; | |||
cudnn_check(cudnnGetConvolutionForwardWorkspaceSize( | |||
cudnn_handle, x_desc, w_desc, conv_desc, y_desc, | |||
cudnn_check(cudnn.GetConvolutionForwardWorkspaceSize( | |||
cuda::cudnn_handle(handle), x_desc, w_desc, conv_desc, y_desc, | |||
algo_perf[i].algo, &workspace_size)); | |||
if (workspace_size > workspace_limit_in_bytes) continue; | |||
if (!(positive_attr & AlgoAttribute::REPRODUCIBLE)) { | |||
@@ -133,7 +134,7 @@ namespace convolution3d { | |||
return false; | |||
#else | |||
cudnn_check(cudnnGetConvolutionForwardAlgorithm( | |||
cudnn_handle, x_desc, w_desc, conv_desc, y_desc, | |||
cuda::cudnn_handle(handle), x_desc, w_desc, conv_desc, y_desc, | |||
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, | |||
workspace_limit_in_bytes, algo)); | |||
return true; | |||
@@ -74,13 +74,12 @@ Convolution3DForwardImpl::get_algorithm_heuristic( | |||
auto get_cudnn_algo = | |||
[this, &args, workspace_limit_in_bytes, positive_attr, | |||
negative_attr]() -> Convolution3DForwardImpl::AlgoBase* { | |||
auto cudnn_handle = cuda::cudnn_handle(this->handle()); | |||
cudnnConvolutionFwdAlgo_t algo; | |||
CUDNNForwardDescs desc; | |||
args.init_desc(desc); | |||
bool got = cudnn_get_convolution_fwd_algo_helper( | |||
cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc, | |||
this->handle(), desc.src_desc.desc, desc.filter_desc.desc, | |||
desc.conv_desc.desc, desc.dst_desc.desc, | |||
workspace_limit_in_bytes, &algo, positive_attr, negative_attr); | |||
if (got) { | |||
@@ -56,7 +56,7 @@ namespace convolution { | |||
using KernLayout = _kern_layout; \ | |||
using OutputLayout = _output_layout; \ | |||
using Param = _conv_param; \ | |||
static constexpr bool check_bounds = check_bounds_; | |||
static constexpr bool check_bounds = check_bounds_ | |||
#define MEGDNN_COMMA , | |||
template <bool check_bounds_, typename src_ldg_dtype, typename filter_ldg_dtype, | |||
@@ -53,7 +53,7 @@ namespace convolution { | |||
using KernLayout = _kern_layout; \ | |||
using OutputLayout = _output_layout; \ | |||
using Param = _conv_param; \ | |||
static constexpr bool check_bounds = check_bounds_; | |||
static constexpr bool check_bounds = check_bounds_ | |||
#define MEGDNN_COMMA , | |||
template <bool check_bounds_, typename IMMAConfig_, typename WarpTileConfig_, | |||
@@ -53,7 +53,7 @@ namespace convolution { | |||
using KernLayout = _kern_layout; \ | |||
using OutputLayout = _output_layout; \ | |||
using Param = _conv_param; \ | |||
static constexpr bool check_bounds = check_bounds_; | |||
static constexpr bool check_bounds = check_bounds_ | |||
#define MEGDNN_COMMA , | |||
template <bool check_bounds_, typename ldg_dtype, typename RegBlockConfig_, | |||
@@ -0,0 +1,371 @@ | |||
/** | |||
* \file dnn/src/cuda/correlation/correlation_cuda.cu | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "src/cuda/correlation/correlation_cuda.cuh" | |||
#include <cfloat> | |||
#include "megdnn/dtype.h" | |||
#include "src/cuda/query_blocksize.cuh" | |||
#include "src/cuda/utils.cuh" | |||
#define ROUND_OFF 50000 | |||
using namespace megdnn; | |||
namespace megdnn { | |||
namespace cuda { | |||
namespace correlation { | |||
#define CUDA_KERNEL_LOOP(vtid, vthreads) \ | |||
for (int vtid = blockIdx.x * blockDim.x + threadIdx.x; vtid < vthreads; \ | |||
vtid += blockDim.x * gridDim.x) | |||
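// grid-stride loop: each thread starts at its global index and advances by the | |||
// total number of launched threads, so any launch configuration covers all | |||
// `vthreads` elements | |||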
template <typename T> | |||
__global__ void forward_kernel(const int nthreads, const T* data1, | |||
const T* data2, T* dst, const int bchannels, | |||
const int bheight, const int bwidth, | |||
const int tchannels, const int theight, | |||
const int twidth, const int kernel_size, | |||
const int max_displacement, const int stride1, | |||
const int stride2, const int pad_size, | |||
const bool is_multiply) { | |||
CUDA_KERNEL_LOOP(idx, nthreads) { | |||
int kernel_radius = (kernel_size - 1) / 2; | |||
int neighborhood_grid_radius = max_displacement / stride2; | |||
int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; | |||
int x = idx % twidth; | |||
int y = (idx / twidth) % theight; | |||
int c = (idx / twidth / theight) % tchannels; | |||
int n = idx / twidth / theight / tchannels; | |||
// get src center position in image1 | |||
int x1 = x * stride1 + kernel_radius + max_displacement - pad_size; | |||
int y1 = y * stride1 + kernel_radius + max_displacement - pad_size; | |||
// get offset of center in image2 | |||
int s2o = (c % neighborhood_grid_width - neighborhood_grid_radius) * | |||
stride2; | |||
int s2p = (c / neighborhood_grid_width - neighborhood_grid_radius) * | |||
stride2; | |||
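        // each output channel c corresponds to one displacement (s2o, s2p) on the | |||
        // (2*neighborhood_grid_radius+1)^2 grid of candidate offsets in image2 | |||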
int x2 = x1 + s2o; | |||
int y2 = y1 + s2p; | |||
// compute kernel correlation | |||
T sum = T(0.f); | |||
for (int i = -kernel_radius; i <= kernel_radius; i++) { | |||
for (int j = -kernel_radius; j <= kernel_radius; j++) { | |||
int in_x1 = x1 + i; | |||
int in_y1 = y1 + j; | |||
int in_x2 = x2 + i; | |||
int in_y2 = y2 + j; | |||
for (int channel = 0; channel < bchannels; channel++) { | |||
T tmp1 = T(0.f); | |||
T tmp2 = T(0.f); | |||
if (in_x1 >= 0 && in_x1 < bwidth && in_y1 >= 0 && | |||
in_y1 < bheight) { | |||
int idx1 = | |||
((n * bchannels + channel) * bheight + in_y1) * | |||
bwidth + | |||
in_x1; | |||
tmp1 = data1[idx1]; | |||
} | |||
if (in_x2 >= 0 && in_x2 < bwidth && in_y2 >= 0 && | |||
in_y2 < bheight) { | |||
int idx2 = | |||
((n * bchannels + channel) * bheight + in_y2) * | |||
bwidth + | |||
in_x2; | |||
tmp2 = data2[idx2]; | |||
} | |||
if (is_multiply) { | |||
sum += tmp1 * tmp2; | |||
} else { | |||
sum += fabsf(tmp1 - tmp2); | |||
} | |||
} | |||
} | |||
} | |||
const int sumelems = | |||
(kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bchannels; | |||
dst[idx] = sum / sumelems; | |||
} | |||
} | |||
template <typename T> | |||
__global__ void backward_kernel_data1( | |||
const int nthreads, const T* diff, const T* data1, const T* data2, | |||
T* grad1, const int bchannels, const int bheight, const int bwidth, | |||
const int tchannels, const int theight, const int twidth, | |||
const int kernel_size, const int max_displacement, const int stride1, | |||
const int stride2, const int pad_size, const bool is_multiply) { | |||
CUDA_KERNEL_LOOP(idx, nthreads) { | |||
int kernel_radius = (kernel_size - 1) / 2; | |||
int neighborhood_grid_radius = max_displacement / stride2; | |||
int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; | |||
int x = idx % bwidth; | |||
int y = (idx / bwidth) % bheight; | |||
int c = (idx / bwidth / bheight) % bchannels; | |||
int n = idx / bwidth / bheight / bchannels; | |||
T tmp1 = data1[idx]; | |||
// Get X,Y ranges and clamp | |||
        // round_off is a trick to enable integer division with ceil, even for | |||
        // negative numbers. We use a large offset so that the inner part never | |||
        // becomes negative. | |||
const int round_off = ROUND_OFF; | |||
const int round_off_s1 = stride1 * round_off; | |||
        // compute the x_min/y_min/x_max/y_max range of diff that contributes | |||
        // to grad1(x,y); for x_min/y_min, (x,y) lies at the bottom-right of the | |||
        // corresponding window: | |||
        // ceil((l - 2*kernel_radius - max_displacement + pad_size) / stride1) | |||
int xmin = (x + pad_size - 2 * kernel_radius - max_displacement + | |||
round_off_s1 - 1) / | |||
stride1 + | |||
1 - round_off; | |||
int ymin = (y + pad_size - 2 * kernel_radius - max_displacement + | |||
round_off_s1 - 1) / | |||
stride1 + | |||
1 - round_off; | |||
// floor (l - max_displacement + pad_size) / stride1 | |||
int xmax = (x + pad_size - max_displacement + round_off_s1) / stride1 - | |||
round_off; | |||
int ymax = (y + pad_size - max_displacement + round_off_s1) / stride1 - | |||
round_off; | |||
T sum = T(0.f); | |||
if (xmax >= 0 && ymax >= 0 && (xmin <= twidth - 1) && | |||
(ymin <= theight - 1)) { | |||
xmin = max(0, xmin); | |||
xmax = min(twidth - 1, xmax); | |||
ymin = max(0, ymin); | |||
ymax = min(theight - 1, ymax); | |||
for (int p = -neighborhood_grid_radius; | |||
p <= neighborhood_grid_radius; p++) { | |||
for (int o = -neighborhood_grid_radius; | |||
o <= neighborhood_grid_radius; o++) { | |||
// Get bottom1 data: | |||
int s2o = stride2 * o; | |||
int s2p = stride2 * p; | |||
int x2 = x + s2o, y2 = y + s2p; | |||
int idx2 = | |||
((n * bchannels + c) * bheight + y2) * bwidth + x2; | |||
T tmp2 = T(0.f); | |||
if (x2 >= 0 && x2 < bwidth && y2 >= 0 && y2 < bheight) { | |||
tmp2 = data2[idx2]; | |||
} | |||
int op = (p + neighborhood_grid_radius) * | |||
neighborhood_grid_width + | |||
(o + neighborhood_grid_radius); | |||
int diff_channels_offset = (n * tchannels + op); | |||
for (int diff_y = ymin; diff_y <= ymax; diff_y++) { | |||
for (int diff_x = xmin; diff_x <= xmax; diff_x++) { | |||
int idxtopdiff = | |||
(diff_channels_offset * theight + diff_y) * | |||
twidth + | |||
diff_x; | |||
if (is_multiply) { | |||
sum += diff[idxtopdiff] * tmp2; | |||
} else { | |||
T sign = (tmp1 >= tmp2) ? T(1.f) : T(-1.f); | |||
sum += diff[idxtopdiff] * sign; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
const int sumelems = | |||
(kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bchannels; | |||
grad1[idx] = sum / sumelems; | |||
} | |||
} | |||
template <typename T> | |||
__global__ void backward_kernel_data2( | |||
const int nthreads, const T* diff, const T* data1, const T* data2, | |||
T* grad2, const int bchannels, const int bheight, const int bwidth, | |||
const int tchannels, const int theight, const int twidth, | |||
const int kernel_size, const int max_displacement, const int stride1, | |||
const int stride2, const int pad_size, const bool is_multiply) { | |||
CUDA_KERNEL_LOOP(idx, nthreads) { | |||
int kernel_radius = (kernel_size - 1) / 2; | |||
int neighborhood_grid_radius = max_displacement / stride2; | |||
int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; | |||
int x = idx % bwidth; | |||
int y = (idx / bwidth) % bheight; | |||
int c = (idx / bwidth / bheight) % bchannels; | |||
int n = idx / bwidth / bheight / bchannels; | |||
T tmp2 = data2[idx]; | |||
T sum = T(0.f); | |||
for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; | |||
p++) { | |||
for (int o = -neighborhood_grid_radius; | |||
o <= neighborhood_grid_radius; o++) { | |||
int s2o = o * stride2; | |||
int s2p = p * stride2; | |||
int x1 = x - s2o; | |||
int y1 = y - s2p; | |||
const int round_off = ROUND_OFF; | |||
const int round_off_s1 = stride1 * round_off; | |||
int xmin = (x1 + pad_size - 2 * kernel_radius - | |||
max_displacement + round_off_s1 - 1) / | |||
stride1 + | |||
1 - round_off; | |||
int ymin = (y1 + pad_size - 2 * kernel_radius - | |||
max_displacement + round_off_s1 - 1) / | |||
stride1 + | |||
1 - round_off; | |||
int xmax = (x1 + pad_size - max_displacement + round_off_s1) / | |||
stride1 - | |||
round_off; | |||
int ymax = (y1 + pad_size - max_displacement + round_off_s1) / | |||
stride1 - | |||
round_off; | |||
if (xmax >= 0 && ymax >= 0 && (xmin <= twidth - 1) && | |||
(ymin <= theight - 1)) { | |||
xmin = max(0, xmin); | |||
xmax = min(twidth - 1, xmax); | |||
ymin = max(0, ymin); | |||
ymax = min(theight - 1, ymax); | |||
int idx1 = | |||
((n * bchannels + c) * bheight + y1) * bwidth + x1; | |||
T tmp1 = T(0.f); | |||
if (x1 >= 0 && x1 < bwidth && y1 >= 0 && y1 < bheight) { | |||
tmp1 = data1[idx1]; | |||
} | |||
int op = (p + neighborhood_grid_radius) * | |||
neighborhood_grid_width + | |||
(o + neighborhood_grid_radius); | |||
int diff_channels_offset = (n * tchannels + op); | |||
for (int diff_y = ymin; diff_y <= ymax; diff_y++) { | |||
for (int diff_x = xmin; diff_x <= xmax; diff_x++) { | |||
int idxtopdiff = | |||
(diff_channels_offset * theight + diff_y) * | |||
twidth + | |||
diff_x; | |||
if (is_multiply) { | |||
sum += diff[idxtopdiff] * tmp1; | |||
} else { | |||
T sign = (tmp1 >= tmp2) ? T(-1.f) : T(1.f); | |||
sum += diff[idxtopdiff] * sign; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
const int sumelems = | |||
(kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bchannels; | |||
grad2[idx] = sum / sumelems; | |||
} | |||
} | |||
template <typename T> | |||
void forward_proxy(const int nthreads, const T* data1, const T* data2, T* dst, | |||
const int bchannels, const int bheight, const int bwidth, | |||
const int tchannels, const int theight, const int twidth, | |||
const int kernel_size, const int max_displacement, | |||
const int stride1, const int stride2, const int pad_size, | |||
const bool is_multiply, cudaStream_t stream) { | |||
int threads_block = query_blocksize_for_kernel(forward_kernel<T>); | |||
forward_kernel<T> | |||
<<<DIVUP(nthreads, threads_block), threads_block, 0, stream>>>( | |||
nthreads, data1, data2, dst, bchannels, bheight, bwidth, | |||
tchannels, theight, twidth, kernel_size, max_displacement, | |||
stride1, stride2, pad_size, is_multiply); | |||
after_kernel_launch(); | |||
} | |||
template <typename T> | |||
void backward_proxy_data1(const int nthreads, const T* diff, const T* data1, | |||
const T* data2, T* grad1, const int bchannels, | |||
const int bheight, const int bwidth, | |||
const int tchannels, const int theight, | |||
const int twidth, const int kernel_size, | |||
const int max_displacement, const int stride1, | |||
const int stride2, const int pad_size, | |||
const bool is_multiply, cudaStream_t stream) { | |||
int threads_block = query_blocksize_for_kernel(backward_kernel_data1<T>); | |||
backward_kernel_data1<T> | |||
<<<DIVUP(nthreads, threads_block), threads_block, 0, stream>>>( | |||
nthreads, diff, data1, data2, grad1, bchannels, bheight, | |||
bwidth, tchannels, theight, twidth, kernel_size, | |||
max_displacement, stride1, stride2, pad_size, is_multiply); | |||
after_kernel_launch(); | |||
} | |||
template <typename T> | |||
void backward_proxy_data2(const int nthreads, const T* diff, const T* data1, | |||
const T* data2, T* grad2, const int bchannels, | |||
const int bheight, const int bwidth, | |||
const int tchannels, const int theight, | |||
const int twidth, const int kernel_size, | |||
const int max_displacement, const int stride1, | |||
const int stride2, const int pad_size, | |||
const bool is_multiply, cudaStream_t stream) { | |||
int threads_block = query_blocksize_for_kernel(backward_kernel_data2<T>); | |||
backward_kernel_data2<T> | |||
<<<DIVUP(nthreads, threads_block), threads_block, 0, stream>>>( | |||
nthreads, diff, data1, data2, grad2, bchannels, bheight, | |||
bwidth, tchannels, theight, twidth, kernel_size, | |||
max_displacement, stride1, stride2, pad_size, is_multiply); | |||
after_kernel_launch(); | |||
} | |||
#define INST(T) \ | |||
template void forward_proxy<T>( \ | |||
const int, const T*, const T*, T* dst, const int, const int, \ | |||
const int, const int, const int, const int, const int, const int, \ | |||
const int, const int, const int, const bool, cudaStream_t); \ | |||
template void backward_proxy_data1<T>( \ | |||
const int, const T*, const T*, const T*, T*, const int, const int, \ | |||
const int, const int, const int, const int, const int, const int, \ | |||
const int, const int, const int, const bool, cudaStream_t); \ | |||
template void backward_proxy_data2<T>( \ | |||
const int, const T*, const T*, const T*, T*, const int, const int, \ | |||
const int, const int, const int, const int, const int, const int, \ | |||
const int, const int, const int, const bool, cudaStream_t); | |||
INST(dt_float32) | |||
INST(dt_float16) | |||
INST(dt_bfloat16) | |||
#undef INST | |||
}  // namespace correlation | |||
} // namespace cuda | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,51 @@ | |||
/** | |||
* \file dnn/src/cuda/correlation/correlation.cuh | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include <cuda_runtime_api.h> | |||
namespace megdnn { | |||
namespace cuda { | |||
namespace correlation { | |||
template <typename T> | |||
void forward_proxy(const int nthreads, const T* data1, const T* data2, T* dst, | |||
const int bchannels, const int bheight, const int bwidth, | |||
const int tchannels, const int theight, const int twidth, | |||
const int kernel_size, const int max_displacement, | |||
const int stride1, const int stride2, const int pad_size, | |||
const bool is_multiply, cudaStream_t stream); | |||
template <typename T> | |||
void backward_proxy_data1(const int nthreads, const T* diff, const T* data1, | |||
const T* data2, T* grad1, const int bchannels, | |||
const int bheight, const int bwidth, | |||
const int tchannels, const int theight, | |||
const int twidth, const int kernel_size, | |||
const int max_displacement, const int stride1, | |||
const int stride2, const int pad_size, | |||
const bool is_multiply, cudaStream_t stream); | |||
template <typename T> | |||
void backward_proxy_data2(const int nthreads, const T* diff, const T* data1, | |||
const T* data2, T* grad2, const int bchannels, | |||
const int bheight, const int bwidth, | |||
const int tchannels, const int theight, | |||
const int twidth, const int kernel_size, | |||
const int max_displacement, const int stride1, | |||
const int stride2, const int pad_size, | |||
const bool is_multiply, cudaStream_t stream); | |||
} // namespace correlation | |||
} // namespace cuda | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,129 @@ | |||
/** | |||
 * \file dnn/src/cuda/correlation/opr_impl.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "src/cuda/correlation/opr_impl.h" | |||
#include "src/cuda/correlation/correlation_cuda.cuh" | |||
#include "src/cuda/utils.h" | |||
namespace megdnn { | |||
namespace cuda { | |||
void CorrelationForwardImpl::exec(_megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
check_exec(data1.layout, data2.layout, dst.layout, workspace.size); | |||
auto p = param(); | |||
auto stream = cuda_stream(handle()); | |||
int nthreads = dst.layout.total_nr_elems(); | |||
int stride1 = p.stride1; | |||
int stride2 = p.stride2; | |||
int kernel_size = p.kernel_size; | |||
int max_displacement = p.max_displacement; | |||
int pad_size = p.pad_size; | |||
bool is_multiply = p.is_multiply; | |||
int tchannels = dst.layout[1]; | |||
int theight = dst.layout[2], twidth = dst.layout[3]; | |||
int bchannels = data1.layout[1]; | |||
int bheight = data1.layout[2], bwidth = data1.layout[3]; | |||
using namespace ::megdnn::cuda::correlation; | |||
#define cb(DType) \ | |||
if (data1.layout.dtype == DType()) { \ | |||
using T = typename DTypeTrait<DType>::ctype; \ | |||
forward_proxy<T>(nthreads, data1.ptr<T>(), data2.ptr<T>(), \ | |||
dst.ptr<T>(), bchannels, bheight, bwidth, tchannels, \ | |||
theight, twidth, kernel_size, max_displacement, \ | |||
stride1, stride2, pad_size, is_multiply, stream); \ | |||
} | |||
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||
#undef cb | |||
} | |||
void CorrelationBackwardData1Impl::exec(_megdnn_tensor_in diff, | |||
_megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, | |||
_megdnn_tensor_out grad1, | |||
_megdnn_workspace workspace) { | |||
check_exec(diff.layout, data1.layout, data2.layout, grad1.layout, | |||
workspace.size); | |||
auto stream = cuda_stream(handle()); | |||
int nthreads = grad1.layout.total_nr_elems(); | |||
int stride1 = param().stride1; | |||
int stride2 = param().stride2; | |||
int kernel_size = param().kernel_size; | |||
int max_displacement = param().max_displacement; | |||
int pad_size = param().pad_size; | |||
bool is_multiply = param().is_multiply; | |||
int tchannels = diff.layout[1]; | |||
int theight = diff.layout[2], twidth = diff.layout[3]; | |||
int bchannels = data1.layout[1]; | |||
int bheight = data1.layout[2], bwidth = data1.layout[3]; | |||
using namespace ::megdnn::cuda::correlation; | |||
#define cb(DType) \ | |||
if (diff.layout.dtype == DType()) { \ | |||
using T = typename DTypeTrait<DType>::ctype; \ | |||
backward_proxy_data1<T>(nthreads, diff.ptr<T>(), data1.ptr<T>(), \ | |||
data2.ptr<T>(), grad1.ptr<T>(), bchannels, \ | |||
bheight, bwidth, tchannels, theight, twidth, \ | |||
kernel_size, max_displacement, stride1, \ | |||
stride2, pad_size, is_multiply, stream); \ | |||
} | |||
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||
#undef cb | |||
} | |||
void CorrelationBackwardData2Impl::exec(_megdnn_tensor_in diff, | |||
_megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, | |||
_megdnn_tensor_out grad2, | |||
_megdnn_workspace workspace) { | |||
check_exec(diff.layout, data1.layout, data2.layout, grad2.layout, | |||
workspace.size); | |||
auto p = param(); | |||
auto stream = cuda_stream(handle()); | |||
int nthreads = grad2.layout.total_nr_elems(); | |||
int stride1 = p.stride1; | |||
int stride2 = p.stride2; | |||
int kernel_size = p.kernel_size; | |||
int max_displacement = p.max_displacement; | |||
int pad_size = p.pad_size; | |||
bool is_multiply = p.is_multiply; | |||
int tchannels = diff.layout[1]; | |||
int theight = diff.layout[2], twidth = diff.layout[3]; | |||
int bchannels = data1.layout[1]; | |||
int bheight = data1.layout[2], bwidth = data1.layout[3]; | |||
using namespace ::megdnn::cuda::correlation; | |||
#define cb(DType) \ | |||
if (diff.layout.dtype == DType()) { \ | |||
using T = typename DTypeTrait<DType>::ctype; \ | |||
backward_proxy_data2<T>(nthreads, diff.ptr<T>(), data1.ptr<T>(), \ | |||
data2.ptr<T>(), grad2.ptr<T>(), bchannels, \ | |||
bheight, bwidth, tchannels, theight, twidth, \ | |||
kernel_size, max_displacement, stride1, \ | |||
stride2, pad_size, is_multiply, stream); \ | |||
} | |||
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||
#undef cb | |||
} | |||
} // namespace cuda | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,61 @@ | |||
/** | |||
 * \file dnn/src/cuda/correlation/opr_impl.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megdnn/oprs.h" | |||
#include "src/cuda/cudnn_wrapper.h" | |||
namespace megdnn { | |||
namespace cuda { | |||
class CorrelationForwardImpl final : public CorrelationForward { | |||
public: | |||
using CorrelationForward::CorrelationForward; | |||
void exec(_megdnn_tensor_in data1, _megdnn_tensor_in data2, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout& data1, | |||
const TensorLayout& data2, | |||
const TensorLayout& dst) override { | |||
return 0; | |||
} | |||
}; | |||
class CorrelationBackwardData1Impl final : public CorrelationBackwardData1 { | |||
public: | |||
using CorrelationBackwardData1::CorrelationBackwardData1; | |||
void exec(_megdnn_tensor_in diff, _megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, _megdnn_tensor_out grad1, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
} | |||
}; | |||
class CorrelationBackwardData2Impl final : public CorrelationBackwardData2 { | |||
public: | |||
using CorrelationBackwardData2::CorrelationBackwardData2; | |||
void exec(_megdnn_tensor_in diff, _megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, _megdnn_tensor_out grad2, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
} | |||
}; | |||
} // namespace cuda | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -41,14 +41,6 @@ | |||
#include "../util_device.cuh" | |||
#include "../util_namespace.cuh" | |||
#include <thrust/version.h> | |||
#if (THRUST_VERSION >= 100700) | |||
// This iterator is compatible with Thrust API 1.7 and newer | |||
#include <thrust/iterator/iterator_facade.h> | |||
#include <thrust/iterator/iterator_traits.h> | |||
#endif // THRUST_VERSION | |||
/// Optional outer namespace(s) | |||
CUB_NS_PREFIX | |||
@@ -121,17 +113,7 @@ public: | |||
typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to | |||
typedef value_type reference; ///< The type of a reference to an element the iterator can point to | |||
#if (THRUST_VERSION >= 100700) | |||
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods | |||
typedef typename thrust::detail::iterator_facade_category< | |||
thrust::any_system_tag, | |||
thrust::random_access_traversal_tag, | |||
value_type, | |||
reference | |||
>::type iterator_category; ///< The iterator category | |||
#else | |||
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category | |||
#endif // THRUST_VERSION | |||
private: | |||
@@ -41,12 +41,6 @@ | |||
#include "../util_device.cuh" | |||
#include "../util_namespace.cuh" | |||
#if (THRUST_VERSION >= 100700) | |||
// This iterator is compatible with Thrust API 1.7 and newer | |||
#include <thrust/iterator/iterator_facade.h> | |||
#include <thrust/iterator/iterator_traits.h> | |||
#endif // THRUST_VERSION | |||
/// Optional outer namespace(s) | |||
CUB_NS_PREFIX | |||
@@ -115,17 +109,7 @@ public: | |||
typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to | |||
typedef ValueType reference; ///< The type of a reference to an element the iterator can point to | |||
#if (THRUST_VERSION >= 100700) | |||
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods | |||
typedef typename thrust::detail::iterator_facade_category< | |||
thrust::device_system_tag, | |||
thrust::random_access_traversal_tag, | |||
value_type, | |||
reference | |||
>::type iterator_category; ///< The iterator category | |||
#else | |||
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category | |||
#endif // THRUST_VERSION | |||
public: | |||
@@ -41,13 +41,6 @@ | |||
#include "../util_device.cuh" | |||
#include "../util_namespace.cuh" | |||
#if (THRUST_VERSION >= 100700) | |||
// This iterator is compatible with Thrust API 1.7 and newer | |||
#include <thrust/iterator/iterator_facade.h> | |||
#include <thrust/iterator/iterator_traits.h> | |||
#endif // THRUST_VERSION | |||
/// Optional outer namespace(s) | |||
CUB_NS_PREFIX | |||
@@ -135,17 +128,7 @@ public: | |||
typedef void pointer; ///< The type of a pointer to an element the iterator can point to | |||
typedef Reference reference; ///< The type of a reference to an element the iterator can point to | |||
#if (THRUST_VERSION >= 100700) | |||
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods | |||
typedef typename thrust::detail::iterator_facade_category< | |||
thrust::device_system_tag, | |||
thrust::random_access_traversal_tag, | |||
value_type, | |||
reference | |||
>::type iterator_category; ///< The iterator category | |||
#else | |||
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category | |||
#endif // THRUST_VERSION | |||
private: | |||
@@ -40,13 +40,6 @@ | |||
#include "../thread/thread_store.cuh" | |||
#include "../util_namespace.cuh" | |||
#if (THRUST_VERSION >= 100700) | |||
// This iterator is compatible with Thrust API 1.7 and newer | |||
#include <thrust/iterator/iterator_facade.h> | |||
#include <thrust/iterator/iterator_traits.h> | |||
#endif // THRUST_VERSION | |||
/// Optional outer namespace(s) | |||
CUB_NS_PREFIX | |||
@@ -104,17 +97,7 @@ public: | |||
typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to | |||
typedef ValueType reference; ///< The type of a reference to an element the iterator can point to | |||
#if (THRUST_VERSION >= 100700) | |||
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods | |||
typedef typename thrust::detail::iterator_facade_category< | |||
thrust::any_system_tag, | |||
thrust::random_access_traversal_tag, | |||
value_type, | |||
reference | |||
>::type iterator_category; ///< The iterator category | |||
#else | |||
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category | |||
#endif // THRUST_VERSION | |||
private: | |||
@@ -41,13 +41,6 @@ | |||
#include "../util_device.cuh" | |||
#include "../util_namespace.cuh" | |||
#if (THRUST_VERSION >= 100700) | |||
// This iterator is compatible with Thrust API 1.7 and newer | |||
#include <thrust/iterator/iterator_facade.h> | |||
#include <thrust/iterator/iterator_traits.h> | |||
#endif // THRUST_VERSION | |||
/// Optional outer namespace(s) | |||
CUB_NS_PREFIX | |||
@@ -102,17 +95,7 @@ public: | |||
typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to | |||
typedef ValueType reference; ///< The type of a reference to an element the iterator can point to | |||
#if (THRUST_VERSION >= 100700) | |||
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods | |||
typedef typename thrust::detail::iterator_facade_category< | |||
thrust::any_system_tag, | |||
thrust::random_access_traversal_tag, | |||
value_type, | |||
reference | |||
>::type iterator_category; ///< The iterator category | |||
#else | |||
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category | |||
#endif // THRUST_VERSION | |||
private: | |||
@@ -39,13 +39,6 @@ | |||
#include "../util_namespace.cuh" | |||
#include "../util_macro.cuh" | |||
#if (THRUST_VERSION >= 100700) | |||
// This iterator is compatible with Thrust API 1.7 and newer | |||
#include <thrust/iterator/iterator_facade.h> | |||
#include <thrust/iterator/iterator_traits.h> | |||
#endif // THRUST_VERSION | |||
/// Optional outer namespace(s) | |||
CUB_NS_PREFIX | |||
@@ -74,17 +67,7 @@ public: | |||
typedef void pointer; ///< The type of a pointer to an element the iterator can point to | |||
typedef void reference; ///< The type of a reference to an element the iterator can point to | |||
#if (THRUST_VERSION >= 100700) | |||
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods | |||
typedef typename thrust::detail::iterator_facade_category< | |||
thrust::any_system_tag, | |||
thrust::random_access_traversal_tag, | |||
value_type, | |||
reference | |||
>::type iterator_category; ///< The iterator category | |||
#else | |||
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category | |||
#endif // THRUST_VERSION | |||
private: | |||
@@ -42,13 +42,6 @@ | |||
#include "../util_debug.cuh" | |||
#include "../util_namespace.cuh" | |||
#if (THRUST_VERSION >= 100700) | |||
// This iterator is compatible with Thrust API 1.7 and newer | |||
#include <thrust/iterator/iterator_facade.h> | |||
#include <thrust/iterator/iterator_traits.h> | |||
#endif // THRUST_VERSION | |||
/// Optional outer namespace(s) | |||
CUB_NS_PREFIX | |||
@@ -119,17 +112,7 @@ public: | |||
typedef T* pointer; ///< The type of a pointer to an element the iterator can point to | |||
typedef T reference; ///< The type of a reference to an element the iterator can point to | |||
#if (THRUST_VERSION >= 100700) | |||
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods | |||
typedef typename thrust::detail::iterator_facade_category< | |||
thrust::device_system_tag, | |||
thrust::random_access_traversal_tag, | |||
value_type, | |||
reference | |||
>::type iterator_category; ///< The iterator category | |||
#else | |||
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category | |||
#endif // THRUST_VERSION | |||
private: | |||
@@ -44,12 +44,6 @@ | |||
#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE) // This iterator is compatible with CUDA 5.5 and newer | |||
#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer | |||
#include <thrust/iterator/iterator_facade.h> | |||
#include <thrust/iterator/iterator_traits.h> | |||
#endif // THRUST_VERSION | |||
/// Optional outer namespace(s) | |||
CUB_NS_PREFIX | |||
@@ -212,17 +206,7 @@ public: | |||
typedef T* pointer; ///< The type of a pointer to an element the iterator can point to | |||
typedef T reference; ///< The type of a reference to an element the iterator can point to | |||
#if (THRUST_VERSION >= 100700) | |||
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods | |||
typedef typename thrust::detail::iterator_facade_category< | |||
thrust::device_system_tag, | |||
thrust::random_access_traversal_tag, | |||
value_type, | |||
reference | |||
>::type iterator_category; ///< The iterator category | |||
#else | |||
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category | |||
#endif // THRUST_VERSION | |||
private: | |||
@@ -41,12 +41,6 @@ | |||
#include "../util_device.cuh" | |||
#include "../util_namespace.cuh" | |||
#if (THRUST_VERSION >= 100700) | |||
// This iterator is compatible with Thrust API 1.7 and newer | |||
#include <thrust/iterator/iterator_facade.h> | |||
#include <thrust/iterator/iterator_traits.h> | |||
#endif // THRUST_VERSION | |||
/// Optional outer namespace(s) | |||
CUB_NS_PREFIX | |||
@@ -125,17 +119,7 @@ public: | |||
typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to | |||
typedef ValueType reference; ///< The type of a reference to an element the iterator can point to | |||
#if (THRUST_VERSION >= 100700) | |||
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods | |||
typedef typename thrust::detail::iterator_facade_category< | |||
thrust::any_system_tag, | |||
thrust::random_access_traversal_tag, | |||
value_type, | |||
reference | |||
>::type iterator_category; ///< The iterator category | |||
#else | |||
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category | |||
#endif // THRUST_VERSION | |||
private: | |||
@@ -38,9 +38,9 @@ | |||
//#define CUB_NS_POSTFIX } } | |||
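// Note: the prefix/postfix below wrap the vendored CUB headers inside megdnn::cuda, so CUB symbols cannot collide with another CUB copy linked into the same binary. | |||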
#ifndef CUB_NS_PREFIX | |||
#define CUB_NS_PREFIX | |||
#define CUB_NS_PREFIX namespace megdnn { namespace cuda { | |||
#endif | |||
#ifndef CUB_NS_POSTFIX | |||
#define CUB_NS_POSTFIX | |||
#define CUB_NS_POSTFIX } } | |||
#endif |
@@ -470,9 +470,9 @@ void Conv3DDesc::set(const param::Convolution3D& param, const size_t nr_group) { | |||
#define V(v) V1(v) | |||
#define DEF_NAME(NAME) \ | |||
#NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL) | |||
#define DEF_ALGO(NAME, PROD) \ | |||
{ \ | |||
NAME, { DEF_NAME(NAME), PROD } \ | |||
#define DEF_ALGO(NAME, PROD1, PROD2) \ | |||
{ \ | |||
NAME, { DEF_NAME(NAME), PROD1, PROD2 } \ | |||
} | |||
#if !(CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1) | |||
@@ -483,19 +483,18 @@ const std::unordered_map<cudnnConvolutionBwdDataAlgo_t, CudnnAlgoPack::Attr> | |||
CudnnAlgoPack::conv_bwd_data_algos() { | |||
static const std::unordered_map<cudnnConvolutionBwdDataAlgo_t, | |||
CudnnAlgoPack::Attr> | |||
algos = { | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true), | |||
algos = | |||
{ DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, true, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true, true), | |||
#if CUDNN_MAJOR >= 5 | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, true, false), | |||
#if CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1 | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED, | |||
true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED, true, false), | |||
#endif | |||
#endif | |||
}; | |||
}; | |||
return algos; | |||
} | |||
@@ -505,15 +504,16 @@ CudnnAlgoPack::conv_bwd_flt_algos() { | |||
static const std::unordered_map<cudnnConvolutionBwdFilterAlgo_t, | |||
CudnnAlgoPack::Attr> | |||
algos = { | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, true, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false, false), | |||
#if CUDNN_MAJOR >= 6 || (CUDNN_MAJOR >= 5 && CUDNN_MINOR >= 1) | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED, | |||
true), | |||
true, false), | |||
#if CUDNN_MAJOR >= 6 | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, true, | |||
true), | |||
#endif | |||
#endif | |||
@@ -522,28 +522,30 @@ CudnnAlgoPack::conv_bwd_flt_algos() { | |||
return algos; | |||
} | |||
const std::unordered_map<cudnnConvolutionFwdAlgo_t, CudnnAlgoPack::Attr> | |||
CudnnAlgoPack::conv_fwd_algos() { | |||
static const std::unordered_map<cudnnConvolutionFwdAlgo_t, | |||
CudnnAlgoPack::Attr> | |||
algos = { | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, | |||
true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_GEMM, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true), | |||
algos = | |||
{ DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true, false), | |||
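// only cuDNN 8.0.4 marks IMPLICIT_PRECOMP_GEMM as accuracy-depend-on-batch below; other versions treat it as batch-independent | |||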
#if CUDNN_VERSION == 8004 | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true, true), | |||
#else | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true, false), | |||
#endif | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_GEMM, true, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, true, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT, true, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true, true), | |||
#if CUDNN_MAJOR >= 5 | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, true, false), | |||
#if CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1 | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, true, false), | |||
#endif | |||
#endif | |||
}; | |||
}; | |||
return algos; | |||
} | |||
@@ -553,9 +555,10 @@ CudnnAlgoPack::conv3d_bwd_data_algos() { | |||
static const std::unordered_map<cudnnConvolutionBwdDataAlgo_t, | |||
CudnnAlgoPack::Attr> | |||
algos = { | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true, | |||
true), | |||
}; | |||
return algos; | |||
@@ -568,9 +571,9 @@ CudnnAlgoPack::conv3d_bwd_flt_algos() { | |||
static const std::unordered_map<cudnnConvolutionBwdFilterAlgo_t, | |||
CudnnAlgoPack::Attr> | |||
algos = { | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true, false), | |||
DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false, false), | |||
}; | |||
return algos; | |||
@@ -581,10 +584,15 @@ CudnnAlgoPack::conv3d_fwd_algos() { | |||
static const std::unordered_map<cudnnConvolutionFwdAlgo_t, | |||
CudnnAlgoPack::Attr> | |||
algos = { | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, | |||
true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true), | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true, false), | |||
#if CUDNN_VERSION == 8004 | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true, | |||
true), | |||
#else | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true, | |||
false), | |||
#endif | |||
DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true, true), | |||
}; | |||
return algos; | |||
@@ -112,6 +112,7 @@ public: | |||
struct Attr { | |||
std::string name; | |||
bool is_reproducible; | |||
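//! whether the algorithm's numerical results may vary with the batch size | |||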
bool accuracy_depend_on_batch; | |||
}; | |||
static const std::unordered_map<cudnnConvolutionBwdDataAlgo_t, Attr> | |||
@@ -16,6 +16,7 @@ | |||
namespace { | |||
using namespace megdnn; | |||
using namespace cuda; | |||
template <typename T> __global__ void kernel(const T *a, const T *b, | |||
dt_float32 *c, | |||
@@ -11,12 +11,15 @@ | |||
#include "src/common/handle_impl.h" | |||
#include "src/common/version_symbol.h" | |||
#include "src/common/api_cache.h" | |||
#include "src/cuda/handle.h" | |||
#include "src/cuda/utils.h" | |||
#include "src/cuda/api_cache.h" | |||
#include <cuda.h> | |||
#include <cstring> | |||
#include <memory> | |||
#define STR_HELPER(x) #x | |||
#define STR(x) STR_HELPER(x) | |||
@@ -88,6 +91,8 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle): | |||
// check tk1 | |||
m_is_tegra_k1 = (strcmp(m_device_prop->name, "GK20A") == 0); | |||
m_cusolver_handle = nullptr; | |||
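// build the memoizing wrappers around the cuDNN query APIs (see HandleImpl::CUDNN) | |||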
m_cudnn_api_cache = std::make_unique<CUDNN>(m_cudnn_handle); | |||
} | |||
HandleImpl::~HandleImpl() noexcept { | |||
@@ -133,8 +138,112 @@ HandleImpl::HandleVendorType HandleImpl::vendor_type() const { | |||
return HandleVendorType::CUDA; | |||
} | |||
} // namespace cuda | |||
} // namespace megdnn | |||
HandleImpl::CUDNN& HandleImpl::cudnn() { | |||
return *m_cudnn_api_cache; | |||
} | |||
HandleImpl::CUDNN::CUDNN(cudnnHandle_t handle) { | |||
m_handle = handle; | |||
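// Each builder below wraps one cuDNN query function; results are cached keyed on the serialized input descriptors, so repeated identical queries can skip the cuDNN call. | |||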
GetConvolutionForwardWorkspaceSize = | |||
FunctionCacheBuilder<>() | |||
.input<Param<cudnnHandle_t>>() | |||
.input<CudnnTensorDescParam>() | |||
.input<CudnnFilterDescParam>() | |||
.input<CudnnConvDescParam>() | |||
.input<CudnnTensorDescParam>() | |||
.input<Param<cudnnConvolutionFwdAlgo_t>>() | |||
.output<RefParam<size_t>>() | |||
.ret<Param<cudnnStatus_t>>() | |||
.build(&cudnnGetConvolutionForwardWorkspaceSize); | |||
#if CUDNN_MAJOR >= 7 | |||
GetConvolutionForwardAlgorithm_v7 = | |||
FunctionCacheBuilder<>() | |||
.input<Param<cudnnHandle_t>>() | |||
.input<CudnnTensorDescParam>() | |||
.input<CudnnFilterDescParam>() | |||
.input<CudnnConvDescParam>() | |||
.input<CudnnTensorDescParam>() | |||
.input<Param<int>>() | |||
.output<RefArraySizeParam<int>>() | |||
.output<ArrayParam<int, | |||
Param<cudnnConvolutionFwdAlgoPerf_t>>>() | |||
.ret<Param<cudnnStatus_t>>() | |||
.build(&cudnnGetConvolutionForwardAlgorithm_v7); | |||
GetConvolutionForwardAlgorithmMaxCount = | |||
FunctionCacheBuilder<>() | |||
.input<Param<cudnnHandle_t>>() | |||
.output<RefParam<int>>() | |||
.ret<Param<cudnnStatus_t>>() | |||
.build(&cudnnGetConvolutionForwardAlgorithmMaxCount); | |||
#endif | |||
GetConvolutionBackwardDataWorkspaceSize = | |||
FunctionCacheBuilder<>() | |||
.input<Param<cudnnHandle_t>>() | |||
.input<CudnnFilterDescParam>() | |||
.input<CudnnTensorDescParam>() | |||
.input<CudnnConvDescParam>() | |||
.input<CudnnTensorDescParam>() | |||
.input<Param<cudnnConvolutionBwdDataAlgo_t>>() | |||
.output<RefParam<size_t>>() | |||
.ret<Param<cudnnStatus_t>>() | |||
.build(&cudnnGetConvolutionBackwardDataWorkspaceSize); | |||
#if CUDNN_MAJOR >= 7 | |||
GetConvolutionBackwardDataAlgorithm_v7 = | |||
FunctionCacheBuilder<>() | |||
.input<Param<cudnnHandle_t>>() | |||
.input<CudnnFilterDescParam>() | |||
.input<CudnnTensorDescParam>() | |||
.input<CudnnConvDescParam>() | |||
.input<CudnnTensorDescParam>() | |||
.input<Param<int>>() | |||
.output<RefArraySizeParam<int>>() | |||
.output<ArrayParam< | |||
int, Param<cudnnConvolutionBwdDataAlgoPerf_t>>>() | |||
.ret<Param<cudnnStatus_t>>() | |||
.build(&cudnnGetConvolutionBackwardDataAlgorithm_v7); | |||
GetConvolutionBackwardDataAlgorithmMaxCount = | |||
FunctionCacheBuilder<>() | |||
.input<Param<cudnnHandle_t>>() | |||
.output<RefParam<int>>() | |||
.ret<Param<cudnnStatus_t>>() | |||
.build(&cudnnGetConvolutionBackwardDataAlgorithmMaxCount); | |||
#endif | |||
GetConvolutionBackwardFilterWorkspaceSize = | |||
FunctionCacheBuilder<>() | |||
.input<Param<cudnnHandle_t>>() | |||
.input<CudnnTensorDescParam>() | |||
.input<CudnnTensorDescParam>() | |||
.input<CudnnConvDescParam>() | |||
.input<CudnnFilterDescParam>() | |||
.input<Param<cudnnConvolutionBwdFilterAlgo_t>>() | |||
.output<RefParam<size_t>>() | |||
.ret<Param<cudnnStatus_t>>() | |||
.build(&cudnnGetConvolutionBackwardFilterWorkspaceSize); | |||
#if CUDNN_MAJOR >= 7 | |||
GetConvolutionBackwardFilterAlgorithm_v7 = | |||
FunctionCacheBuilder<>() | |||
.input<Param<cudnnHandle_t>>() | |||
.input<CudnnTensorDescParam>() | |||
.input<CudnnTensorDescParam>() | |||
.input<CudnnConvDescParam>() | |||
.input<CudnnFilterDescParam>() | |||
.input<Param<int>>() | |||
.output<RefArraySizeParam<int>>() | |||
.output<ArrayParam< | |||
int, Param<cudnnConvolutionBwdFilterAlgoPerf_t>>>() | |||
.ret<Param<cudnnStatus_t>>() | |||
.build(&cudnnGetConvolutionBackwardFilterAlgorithm_v7); | |||
GetConvolutionBackwardFilterAlgorithmMaxCount = | |||
FunctionCacheBuilder<>() | |||
.input<Param<cudnnHandle_t>>() | |||
.output<RefParam<int>>() | |||
.ret<Param<cudnnStatus_t>>() | |||
.build(&cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); | |||
#endif | |||
} | |||
} // namespace cuda | |||
} // namespace megdnn | |||
MEGDNN_VERSION_SYMBOL(CUDA, CUDA_VERSION); | |||
MEGDNN_VERSION_SYMBOL3(CUDNN, CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL); | |||
@@ -124,6 +124,10 @@ class HandleImpl: public HandleImplHelper { | |||
size_t image2d_pitch_alignment() const override; | |||
HandleVendorType vendor_type() const override; | |||
class CUDNN; | |||
CUDNN& cudnn(); | |||
private: | |||
bool m_is_tegra_k1; | |||
int m_device_id; | |||
@@ -156,9 +160,34 @@ class HandleImpl: public HandleImplHelper { | |||
//! device ptr to const scalars | |||
ConstScalars* m_const_scalars; | |||
std::unique_ptr<CUDNN> m_cudnn_api_cache; | |||
void initialize_cusolver(); | |||
}; | |||
class HandleImpl::CUDNN { | |||
cudnnHandle_t m_handle; | |||
public: | |||
CUDNN(cudnnHandle_t handle); | |||
#define WRAP_CUDNN_API(NAME) thin_function<decltype(cudnn##NAME)> NAME; | |||
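// declares a wrapper callable with the same signature as the corresponding cudnn* API; it is bound to a caching implementation in the constructor | |||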
WRAP_CUDNN_API(GetConvolutionForwardWorkspaceSize); | |||
#if CUDNN_MAJOR >= 7 | |||
WRAP_CUDNN_API(GetConvolutionForwardAlgorithm_v7); | |||
WRAP_CUDNN_API(GetConvolutionForwardAlgorithmMaxCount); | |||
#endif | |||
#if CUDNN_MAJOR >= 7 | |||
WRAP_CUDNN_API(GetConvolutionBackwardDataAlgorithm_v7); | |||
WRAP_CUDNN_API(GetConvolutionBackwardDataAlgorithmMaxCount); | |||
#endif | |||
WRAP_CUDNN_API(GetConvolutionBackwardDataWorkspaceSize); | |||
#if CUDNN_MAJOR >= 7 | |||
WRAP_CUDNN_API(GetConvolutionBackwardFilterAlgorithmMaxCount); | |||
WRAP_CUDNN_API(GetConvolutionBackwardFilterAlgorithm_v7); | |||
#endif | |||
WRAP_CUDNN_API(GetConvolutionBackwardFilterWorkspaceSize); | |||
#undef WRAP_CUDNN_API | |||
}; | |||
} // namespace cuda | |||
} // namespace megdnn | |||
@@ -24,6 +24,7 @@ | |||
#include "src/cuda/convolution/opr_impl.h" | |||
#include "src/cuda/convolution3d/opr_impl.h" | |||
#include "src/cuda/convpooling/opr_impl.h" | |||
#include "src/cuda/correlation/opr_impl.h" | |||
#include "src/cuda/cumsum/opr_impl.h" | |||
#include "src/cuda/cvt_color/opr_impl.h" | |||
#include "src/cuda/dct/opr_impl.h" | |||
@@ -24,7 +24,7 @@ namespace images2neibs { | |||
template <typename T> | |||
__global__ void forward_kernel(const T *src, T *dst, | |||
int N, int C, int IH, int IW, int OH, int OW, | |||
int ph, int pw, int sh, int sw, int WH, int WW) | |||
int ph, int pw, int sh, int sw, int dh, int dw, int WH, int WW) | |||
{ | |||
int NC = N * C; | |||
int WP = WH*WW; | |||
@@ -37,8 +37,8 @@ __global__ void forward_kernel(const T *src, T *dst, | |||
if (op < OH * OW) { | |||
int oh = op / OW; | |||
int ow = op % OW; | |||
int ih = -ph + sh * oh + wh; | |||
int iw = -pw + sw * ow + ww; | |||
int ih = -ph + sh * oh + wh * dh; | |||
int iw = -pw + sw * ow + ww * dw; | |||
int dst_pos = nc * OH * OW * WH * WW + op * WH * WW + wp; | |||
int src_pos = nc * IH * IW + ih * IW + iw; | |||
dst[dst_pos] = (ih >= 0 && ih < IH && iw >= 0 && iw < IW) | |||
@@ -52,7 +52,7 @@ __global__ void forward_kernel(const T *src, T *dst, | |||
template <typename T> | |||
void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW, | |||
int ph, int pw, int sh, int sw, int wh, int ww, | |||
int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww, | |||
cudaStream_t stream) { | |||
int spatial_size = OH * OW; | |||
int kernel_size = wh * ww; | |||
@@ -63,7 +63,7 @@ void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW, | |||
int by = N * C; | |||
forward_kernel<<<dim3(bx, std::min(grid_y_max, by)), dim3(tx, ty), 0, | |||
stream>>>(src, dst, N, C, IH, IW, OH, OW, ph, pw, sh, sw, | |||
stream>>>(src, dst, N, C, IH, IW, OH, OW, ph, pw, sh, sw, dh, dw, | |||
wh, ww); | |||
after_kernel_launch(); | |||
} | |||
@@ -73,7 +73,7 @@ void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW, | |||
template <typename T> | |||
__global__ void backward_kernel(const T *diff, T *grad, | |||
int N, int C, int IH, int IW, int OH, int OW, | |||
int ph, int pw, int sh, int sw, int WH, int WW) | |||
int ph, int pw, int sh, int sw, int dh, int dw, int WH, int WW) | |||
{ | |||
int id = threadIdx.x + blockIdx.x * blockDim.x; | |||
if (id < N*C*IH*IW) { | |||
@@ -82,17 +82,20 @@ __global__ void backward_kernel(const T *diff, T *grad, | |||
int iw = id % (IH*IW) % IW; | |||
grad[nc*IH*IW + ih*IW + iw] = 0.0f; | |||
int oh_max = min((ih+ph) / sh, OH-1); | |||
int oh_min = max((ih+ph-(WH-1)+sh-1) / sh, 0); | |||
int oh_min = max((ih+ph-(WH-1)*dh+sh-1) / sh, 0); | |||
int ow_max = min((iw+pw) / sw, OW-1); | |||
int ow_min = max((iw+pw-(WW-1)+sw-1) / sw, 0); | |||
int ow_min = max((iw+pw-(WW-1)*dw+sw-1) / sw, 0); | |||
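// [oh_min, oh_max] x [ow_min, ow_max] covers every output position whose dilated window can touch input pixel (ih, iw) | |||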
for (int oh = oh_min; oh <= oh_max; ++oh) | |||
for (int ow = ow_min; ow <= ow_max; ++ow) | |||
{ | |||
int wh = ih+ph - sh*oh; | |||
int ww = iw+pw - sw*ow; | |||
grad[nc*IH*IW + ih*IW + iw] += | |||
diff[nc*OH*OW*WH*WW + oh*OW*WH*WW + ow*WH*WW + | |||
wh*WW + ww]; | |||
if ((ih+ph - sh*oh)%dh==0 && (iw+pw - sw*ow)%dw==0){ | |||
int wh = ih+ph - sh*oh - (ih+ph - sh*oh)/dh * (dh-1); | |||
int ww = iw+pw - sw*ow - (iw+pw - sw*ow)/dw * (dw-1); | |||
grad[nc*IH*IW + ih*IW + iw] += | |||
diff[nc*OH*OW*WH*WW + oh*OW*WH*WW + ow*WH*WW + | |||
wh*WW + ww]; | |||
} | |||
} | |||
} | |||
} | |||
@@ -100,23 +103,23 @@ __global__ void backward_kernel(const T *diff, T *grad, | |||
template <typename T> | |||
void backward(const T *diff, T *grad, | |||
int N, int C, int IH, int IW, int OH, int OW, | |||
int ph, int pw, int sh, int sw, int wh, int ww, | |||
int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww, | |||
cudaStream_t stream) | |||
{ | |||
int threads = NR_THREADS; | |||
int blocks = DIVUP(N*C*IH*IW, threads); | |||
backward_kernel<<<blocks, threads, 0, stream>>>(diff, grad, | |||
N, C, IH, IW, OH, OW, | |||
ph, pw, sh, sw, wh, ww); | |||
ph, pw, sh, sw, dh, dw, wh, ww); | |||
after_kernel_launch(); | |||
} | |||
#define INST(T) \ | |||
template void forward<T>(const T *, T *, int, int, int, int, int, int, \ | |||
int, int, int, int, int, int, \ | |||
int, int, int, int, int, int, int, int, \ | |||
cudaStream_t); \ | |||
template void backward<T>(const T *, T *, int, int, int, int, int, int, \ | |||
int, int, int, int, int, int, \ | |||
int, int, int, int, int, int, int, int, \ | |||
cudaStream_t); | |||
#define cb(DType) \ | |||
INST(DTypeTrait<DType>::ctype) | |||
@@ -18,13 +18,13 @@ namespace images2neibs { | |||
template <typename T> | |||
void forward(const T *src, T *dst, | |||
int N, int C, int IH, int IW, int OH, int OW, | |||
int ph, int pw, int sh, int sw, int wh, int ww, | |||
int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww, | |||
cudaStream_t stream); | |||
template <typename T> | |||
void backward(const T *diff, T *grad, | |||
int N, int C, int IH, int IW, int OH, int OW, | |||
int ph, int pw, int sh, int sw, int wh, int ww, | |||
int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww, | |||
cudaStream_t stream); | |||
} // namespace images2neibs | |||
@@ -27,13 +27,14 @@ void Images2NeibsForwardImpl::exec(_megdnn_tensor_in src, | |||
int OH = dst.layout[2], OW = dst.layout[3]; | |||
int ph = param().pad_h, pw = param().pad_w; | |||
int sh = param().stride_h, sw = param().stride_w; | |||
int dh = param().dilate_h, dw = param().dilate_w; | |||
int wh = param().window_h, ww = param().window_w; | |||
#define cb(DType) \ | |||
if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) { \ | |||
using T = DTypeTrait<DType>::ctype; \ | |||
images2neibs::forward(src.ptr<T>(), dst.ptr<T>(), \ | |||
N, C, IH, IW, OH, OW, \ | |||
ph, pw, sh, sw, wh, ww, \ | |||
ph, pw, sh, sw, dh, dw, wh, ww, \ | |||
stream); \ | |||
return; \ | |||
} | |||
@@ -53,13 +54,14 @@ void Images2NeibsBackwardImpl::exec(_megdnn_tensor_in diff, | |||
int OH = diff.layout[2], OW = diff.layout[3]; | |||
int ph = param().pad_h, pw = param().pad_w; | |||
int sh = param().stride_h, sw = param().stride_w; | |||
int dh = param().dilate_h, dw = param().dilate_w; | |||
int wh = param().window_h, ww = param().window_w; | |||
#define cb(DType) \ | |||
if (diff.layout.dtype == DType()) { \ | |||
using T = DTypeTrait<DType>::ctype; \ | |||
images2neibs::backward(diff.ptr<T>(), grad.ptr<T>(), \ | |||
N, C, IH, IW, OH, OW, \ | |||
ph, pw, sh, sw, wh, ww, \ | |||
ph, pw, sh, sw, dh, dw, wh, ww, \ | |||
stream); \ | |||
return; \ | |||
} | |||
@@ -89,7 +89,8 @@ public: | |||
void exec(const ExecArgs& args) const override; | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { | |||
@@ -108,7 +109,8 @@ public: | |||
void exec(const ExecArgs& args) const override; | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
const char* name() const override { | |||
@@ -114,7 +114,9 @@ public: | |||
void exec(const ExecArgs& args) const override; | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_CUBLAS) | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE | | |||
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
}; | |||
@@ -141,7 +143,8 @@ public: | |||
void exec(const ExecArgs& args) const override; | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_CUBLASLT) | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
} | |||
}; | |||
#endif | |||
@@ -231,7 +234,8 @@ public: | |||
const char* name() const override { return m_name.c_str(); } | |||
void exec(const ExecArgs& args) const override; | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K) | |||
@@ -23,12 +23,20 @@ using namespace cutlass_wrapper; | |||
bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( | |||
const SizeArgs& args) const { | |||
auto&& param = args.opr->param(); | |||
int n = args.layout_c.shape[1], | |||
int m = args.layout_c.shape[0], n = args.layout_c.shape[1], | |||
k = args.layout_a.shape[param.transposeA ? 0 : 1]; | |||
return args.opr->param().format == param::MatrixMul::Format::DEFAULT && | |||
args.layout_a.dtype == dtype::Float32() && | |||
args.layout_b.dtype == dtype::Float32() && | |||
args.layout_c.dtype == dtype::Float32() && k > n; | |||
bool available = | |||
args.opr->param().format == param::MatrixMul::Format::DEFAULT && | |||
args.layout_a.dtype == dtype::Float32() && | |||
args.layout_b.dtype == dtype::Float32() && | |||
args.layout_c.dtype == dtype::Float32() && k > n; | |||
auto&& device_prop = cuda::current_device_prop(); | |||
int y_grid_limit = device_prop.maxGridSize[1]; | |||
// reject launches that would exceed the device's maximum grid size in the y dimension | |||
available &= ((m + m_algo_param.threadblock_m - 1) / | |||
m_algo_param.threadblock_m <= | |||
y_grid_limit); | |||
return available; | |||
} | |||
size_t MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_workspace_in_bytes( | |||
@@ -36,7 +44,7 @@ size_t MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_workspace_in_bytes( | |||
auto&& param = args.opr->param(); | |||
int m = args.layout_c.shape[0], n = args.layout_c.shape[1], | |||
k = args.layout_a.shape[param.transposeA ? 0 : 1]; | |||
int split_k_slices = k / n; | |||
int split_k_slices = std::max(1, k / n); | |||
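// max with 1 guards against k <= n, which would otherwise give zero split-k slices and an empty workspace | |||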
return args.layout_c.dtype.size(m * n * split_k_slices); | |||
} | |||
@@ -49,7 +57,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::exec( | |||
int m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1], | |||
k = args.tensor_a.layout.shape[param.transposeA ? 0 : 1]; | |||
GemmCoord problem_size{m, n, k}; | |||
int split_k_slices = k / n; | |||
int split_k_slices = std::max(1, k / n); | |||
auto&& stream = cuda_stream(args.opr->handle()); | |||
int* workspace = reinterpret_cast<int*>(args.workspace.raw_ptr); | |||
return cutlass_matrix_mul_float32_simt( | |||
@@ -43,6 +43,8 @@ | |||
namespace { | |||
using namespace megdnn::cuda; | |||
template <int block_size_log2, int max_nr_threads_per_row> | |||
__global__ void reduce_column_with_scale_u4(const uint8_t* src, int32_t scale, | |||
int rows, int cols_int32, | |||
@@ -355,8 +355,8 @@ static __global__ void kern_reduce_block_cnt(const ctype* input_data, | |||
static MEGDNN_NOINLINE cudaError_t | |||
invoke_cub_scan(const uint64_t* input, uint64_t* output, void* workspace, | |||
size_t& workspace_size, uint32_t size, cudaStream_t stream) { | |||
return cub::DeviceScan::InclusiveSum(workspace, workspace_size, input, | |||
output, size, stream); | |||
return cub::DeviceScan::InclusiveSum(workspace, workspace_size, | |||
input, output, size, stream); | |||
} | |||
static __global__ void kern_init_zero(uint64_t* dst) { | |||
@@ -11,7 +11,6 @@ | |||
*/ | |||
#include "./opr_impl.h" | |||
#include "./algos.h" | |||
#include "hcc_detail/hcc_defs_prologue.h" | |||
#include "src/common/algo_chooser.h" | |||
#include "src/common/utils.cuh" | |||
@@ -0,0 +1,384 @@ | |||
/** | |||
* \file dnn/src/naive/correlation/opr_impl.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "src/naive/correlation/opr_impl.h" | |||
#include <algorithm> | |||
#include "src/common/utils.h" | |||
#include "src/naive/handle.h" | |||
#define ROUND_OFF 50000 | |||
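// large bias added before integer division so the quotient rounds consistently for negative numerators; see the comments in backward_data1 | |||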
using namespace megdnn; | |||
using namespace naive; | |||
using namespace std; | |||
namespace { | |||
using Param = megdnn::Correlation::Param; | |||
template <typename T> | |||
void forward(_megdnn_tensor_in data1, _megdnn_tensor_in data2, | |||
_megdnn_tensor_out dst, const Param& param) { | |||
// data1 is treated as an unpadded tensor | |||
int total_nr_elems = dst.layout.total_nr_elems(); | |||
int stride1 = param.stride1, stride2 = param.stride2; | |||
int kernel_size = param.kernel_size; | |||
int kernel_radius = (kernel_size - 1) / 2; | |||
int max_displacement = param.max_displacement; | |||
int pad_size = param.pad_size; | |||
int tchannels = dst.layout[1]; | |||
int theight = dst.layout[2], twidth = dst.layout[3]; | |||
int bchannels = data1.layout[1]; | |||
int bheight = data1.layout[2], bwidth = data1.layout[3]; | |||
int neighborhood_grid_radius = max_displacement / stride2; | |||
int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; | |||
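// each of the tchannels output channels corresponds to one displacement (s2o, s2p) inside the (2r+1) x (2r+1) search grid | |||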
for (int idx = 0; idx < total_nr_elems; ++idx) { | |||
int x = idx % twidth; | |||
int y = (idx / twidth) % theight; | |||
int c = (idx / twidth / theight) % tchannels; | |||
int n = idx / twidth / theight / tchannels; | |||
// get src center position in image1 | |||
int x1 = x * stride1 + kernel_radius + max_displacement - pad_size; | |||
int y1 = y * stride1 + kernel_radius + max_displacement - pad_size; | |||
// get offset of center in image2 | |||
int s2o = (c % neighborhood_grid_width - neighborhood_grid_radius) * | |||
stride2; | |||
int s2p = (c / neighborhood_grid_width - neighborhood_grid_radius) * | |||
stride2; | |||
int x2 = x1 + s2o; | |||
int y2 = y1 + s2p; | |||
// compute kernel correlation | |||
float sum = 0.; | |||
for (int i = -kernel_radius; i <= kernel_radius; i++) { | |||
for (int j = -kernel_radius; j <= kernel_radius; j++) { | |||
int in_x1 = x1 + i; | |||
int in_y1 = y1 + j; | |||
int in_x2 = x2 + i; | |||
int in_y2 = y2 + j; | |||
for (int channel = 0; channel < bchannels; channel++) { | |||
float tmp1 = 0.; | |||
float tmp2 = 0.; | |||
if (in_x1 >= 0 && in_x1 < bwidth && in_y1 >= 0 && | |||
in_y1 < bheight) { | |||
int idx1 = | |||
((n * bchannels + channel) * bheight + in_y1) * | |||
bwidth + | |||
in_x1; | |||
tmp1 = data1.ptr<T>()[idx1]; | |||
} | |||
if (in_x2 >= 0 && in_x2 < bwidth && in_y2 >= 0 && | |||
in_y2 < bheight) { | |||
int idx2 = | |||
((n * bchannels + channel) * bheight + in_y2) * | |||
bwidth + | |||
in_x2; | |||
tmp2 = data2.ptr<T>()[idx2]; | |||
} | |||
if (param.is_multiply) { | |||
sum += tmp1 * tmp2; | |||
} else { | |||
sum += fabsf(tmp1 - tmp2); | |||
} | |||
} | |||
} | |||
} | |||
const int sumelems = | |||
(kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bchannels; | |||
dst.ptr<T>()[idx] = sum / sumelems; | |||
} | |||
} | |||
template <typename T> | |||
void backward_data1(_megdnn_tensor_in diff, _megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, _megdnn_tensor_out grad1, | |||
const Param& param) { | |||
// data1 is treated as an unpadded tensor | |||
// int total_nr_elems = diff.layout.total_nr_elems(); | |||
int total_nr_elems = grad1.layout.total_nr_elems(); | |||
int stride1 = param.stride1, stride2 = param.stride2; | |||
int kernel_size = param.kernel_size; | |||
int kernel_radius = (kernel_size - 1) / 2; | |||
int max_displacement = param.max_displacement; | |||
int pad_size = param.pad_size; | |||
int tchannels = diff.layout[1]; | |||
int theight = diff.layout[2], twidth = diff.layout[3]; | |||
int bchannels = grad1.layout[1]; | |||
int bheight = grad1.layout[2], bwidth = grad1.layout[3]; | |||
int neighborhood_grid_radius = max_displacement / stride2; | |||
int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; | |||
for (int idx = 0; idx < total_nr_elems; ++idx) { | |||
// idx for grad1 | |||
int x = idx % bwidth; | |||
int y = (idx / bwidth) % bheight; | |||
int c = (idx / bwidth / bheight) % bchannels; | |||
int n = idx / bwidth / bheight / bchannels; | |||
float tmp1 = data1.ptr<T>()[idx]; | |||
// Get X,Y ranges and clamp | |||
// round_off is a trick to enable integer division with ceil, even for | |||
// negative numbers. We add a large offset so that the inner expression | |||
// never becomes negative. | |||
const int round_off = ROUND_OFF; | |||
const int round_off_s1 = stride1 * round_off; | |||
// compute the x_min, y_min, x_max, y_max range of diff that contributes to grad1(x, y); | |||
// for diff_x_min / diff_y_min, (x, y) lies at the bottom-right corner of the window | |||
// ceil (l - 2*kernel_radius - max_displacement + pad_size) / stride1 | |||
int xmin = (x + pad_size - 2 * kernel_radius - max_displacement + | |||
round_off_s1 - 1) / | |||
stride1 + | |||
1 - round_off; | |||
int ymin = (y + pad_size - 2 * kernel_radius - max_displacement + | |||
round_off_s1 - 1) / | |||
stride1 + | |||
1 - round_off; | |||
// floor (l - max_displacement + pad_size) / stride1 | |||
int xmax = (x + pad_size - max_displacement + round_off_s1) / stride1 - | |||
round_off; | |||
int ymax = (y + pad_size - max_displacement + round_off_s1) / stride1 - | |||
round_off; | |||
float sum = 0.; | |||
if (xmax >= 0 && ymax >= 0 && (xmin <= twidth - 1) && | |||
(ymin <= theight - 1)) { | |||
xmin = max(0, xmin); | |||
xmax = min(twidth - 1, xmax); | |||
ymin = max(0, ymin); | |||
ymax = min(theight - 1, ymax); | |||
for (int p = -neighborhood_grid_radius; | |||
p <= neighborhood_grid_radius; p++) { | |||
for (int o = -neighborhood_grid_radius; | |||
o <= neighborhood_grid_radius; o++) { | |||
// fetch the data2 value at the displaced position (x2, y2) | |||
int s2o = stride2 * o; | |||
int s2p = stride2 * p; | |||
int x2 = x + s2p, y2 = y + s2o; | |||
int idx2 = | |||
((n * bchannels + c) * bheight + y2) * bwidth + x2; | |||
float tmp2 = 0.; | |||
if (x2 >= 0 && x2 < bwidth && y2 >= 0 && y2 < bheight) { | |||
tmp2 = data2.ptr<T>()[idx2]; | |||
} | |||
int op = (p + neighborhood_grid_radius) * | |||
neighborhood_grid_width + | |||
(o + neighborhood_grid_radius); | |||
int diff_channels_offset = (n * tchannels + op); | |||
for (int diff_y = ymin; diff_y <= ymax; diff_y++) { | |||
for (int diff_x = xmin; diff_x <= xmax; diff_x++) { | |||
int idxtopdiff = | |||
(diff_channels_offset * theight + diff_y) * | |||
twidth + | |||
diff_x; | |||
if (param.is_multiply) { | |||
sum += diff.ptr<T>()[idxtopdiff] * tmp2; | |||
} else { | |||
T sign = (tmp1 > tmp2) ? T(1.) : T(-1.); | |||
sum += diff.ptr<T>()[idxtopdiff] * sign; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
const int sumelems = | |||
(kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bchannels; | |||
grad1.ptr<T>()[idx] = sum / sumelems; | |||
} | |||
} | |||
template <typename T> | |||
void backward_data2(_megdnn_tensor_in diff, _megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, _megdnn_tensor_out grad2, | |||
const Param& param) { | |||
// data1 is treated as an unpadded tensor | |||
int total_nr_elems = grad2.layout.total_nr_elems(); | |||
int stride1 = param.stride1, stride2 = param.stride2; | |||
int kernel_size = param.kernel_size; | |||
int kernel_radius = (kernel_size - 1) / 2; | |||
int max_displacement = param.max_displacement; | |||
int pad_size = param.pad_size; | |||
int tchannels = diff.layout[1]; | |||
int theight = diff.layout[2], twidth = diff.layout[3]; | |||
int bchannels = grad2.layout[1]; | |||
int bheight = grad2.layout[2], bwidth = grad2.layout[3]; | |||
int neighborhood_grid_radius = max_displacement / stride2; | |||
int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; | |||
for (int idx = 0; idx < total_nr_elems; ++idx) { | |||
int x = idx % bwidth; | |||
int y = (idx / bwidth) % bheight; | |||
int c = (idx / bwidth / bheight) % bchannels; | |||
int n = idx / bwidth / bheight / bchannels; | |||
T tmp2 = data2.ptr<T>()[idx]; | |||
T sum = T(0.f); | |||
for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; | |||
p++) { | |||
for (int o = -neighborhood_grid_radius; | |||
o <= neighborhood_grid_radius; o++) { | |||
int s2o = o * stride2; | |||
int s2p = p * stride2; | |||
int x1 = x - s2o; | |||
int y1 = y - s2p; | |||
const int round_off = ROUND_OFF; | |||
const int round_off_s1 = stride1 * round_off; | |||
int xmin = (x1 + pad_size - 2 * kernel_radius - | |||
max_displacement + round_off_s1 - 1) / | |||
stride1 + | |||
1 - round_off; | |||
int ymin = (y1 + pad_size - 2 * kernel_radius - | |||
max_displacement + round_off_s1 - 1) / | |||
stride1 + | |||
1 - round_off; | |||
int xmax = (x1 + pad_size - max_displacement + round_off_s1) / | |||
stride1 - | |||
round_off; | |||
int ymax = (y1 + pad_size - max_displacement + round_off_s1) / | |||
stride1 - | |||
round_off; | |||
if (xmax >= 0 && ymax >= 0 && (xmin <= twidth - 1) && | |||
(ymin <= theight - 1)) { | |||
xmin = max(0, xmin); | |||
xmax = min(twidth - 1, xmax); | |||
ymin = max(0, ymin); | |||
ymax = min(theight - 1, ymax); | |||
int idx1 = | |||
((n * bchannels + c) * bheight + y1) * bwidth + x1; | |||
T tmp1 = T(0.f); | |||
if (x1 >= 0 && x1 < bwidth && y1 >= 0 && y1 < bheight) { | |||
tmp1 = data1.ptr<T>()[idx1]; | |||
} | |||
int op = (p + neighborhood_grid_radius) * | |||
neighborhood_grid_width + | |||
(o + neighborhood_grid_radius); | |||
int diff_channels_offset = (n * tchannels + op); | |||
for (int diff_y = ymin; diff_y <= ymax; diff_y++) { | |||
for (int diff_x = xmin; diff_x <= xmax; diff_x++) { | |||
int idxtopdiff = | |||
(diff_channels_offset * theight + diff_y) * | |||
twidth + | |||
diff_x; | |||
if (param.is_multiply) { | |||
sum += diff.ptr<T>()[idxtopdiff] * tmp1; | |||
} else { | |||
T sign = (tmp1 >= tmp2) ? T(-1.f) : T(1.f); | |||
sum += diff.ptr<T>()[idxtopdiff] * sign; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
const int sumelems = | |||
(kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bchannels; | |||
grad2.ptr<T>()[idx] = sum / sumelems; | |||
} | |||
} | |||
} // namespace | |||
namespace megdnn { | |||
namespace naive { | |||
void CorrelationForwardImpl::exec(_megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
check_exec(data1.layout, data2.layout, dst.layout, workspace.size); | |||
#define cb(DType) \ | |||
if (data1.layout.dtype == DType()) { \ | |||
MEGDNN_DISPATCH_CPU_KERN_OPR( \ | |||
forward<typename DTypeTrait<DType>::ctype>(data1, data2, dst, \ | |||
param())); \ | |||
return; \ | |||
} | |||
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||
#undef cb | |||
megdnn_throw("bad dtype"); | |||
} | |||
void CorrelationBackwardData1Impl::exec(_megdnn_tensor_in diff, | |||
_megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, | |||
_megdnn_tensor_out grad1, | |||
_megdnn_workspace workspace) { | |||
check_exec(diff.layout, data1.layout, data2.layout, grad1.layout, | |||
workspace.size); | |||
#define cb(DType) \ | |||
if (diff.layout.dtype == DType()) { \ | |||
MEGDNN_DISPATCH_CPU_KERN_OPR( \ | |||
backward_data1<typename DTypeTrait<DType>::ctype>( \ | |||
diff, data1, data2, grad1, param())); \ | |||
return; \ | |||
} | |||
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||
#undef cb | |||
megdnn_throw("bad dtype"); | |||
} | |||
void CorrelationBackwardData2Impl::exec(_megdnn_tensor_in diff, | |||
_megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, | |||
_megdnn_tensor_out grad2, | |||
_megdnn_workspace workspace) { | |||
check_exec(diff.layout, data1.layout, data2.layout, grad2.layout, | |||
workspace.size); | |||
#define cb(DType) \ | |||
if (diff.layout.dtype == DType()) { \ | |||
MEGDNN_DISPATCH_CPU_KERN_OPR( \ | |||
backward_data2<typename DTypeTrait<DType>::ctype>( \ | |||
diff, data1, data2, grad2, param())); \ | |||
return; \ | |||
} | |||
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||
#undef cb | |||
megdnn_throw("bad dtype"); | |||
} | |||
} // namespace naive | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,58 @@ | |||
/** | |||
* \file dnn/src/naive/correlation/opr_impl.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megdnn/oprs.h" | |||
namespace megdnn { | |||
namespace naive { | |||
class CorrelationForwardImpl final : public CorrelationForward { | |||
public: | |||
using CorrelationForward::CorrelationForward; | |||
void exec(_megdnn_tensor_in data1, _megdnn_tensor_in data2, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
} | |||
}; | |||
class CorrelationBackwardData1Impl final : public CorrelationBackwardData1 { | |||
public: | |||
using CorrelationBackwardData1::CorrelationBackwardData1; | |||
void exec(_megdnn_tensor_in diff, _megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, _megdnn_tensor_out grad1, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
} | |||
}; | |||
class CorrelationBackwardData2Impl final : public CorrelationBackwardData2 { | |||
public: | |||
using CorrelationBackwardData2::CorrelationBackwardData2; | |||
void exec(_megdnn_tensor_in diff, _megdnn_tensor_in data1, | |||
_megdnn_tensor_in data2, _megdnn_tensor_out grad2, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
} | |||
}; | |||
} // namespace naive | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -28,6 +28,7 @@ | |||
#include "src/naive/convolution/opr_impl.h" | |||
#include "src/naive/convolution3d/opr_impl.h" | |||
#include "src/naive/convpooling/opr_impl.h" | |||
#include "src/naive/correlation/opr_impl.h" | |||
#include "src/naive/cumsum/opr_impl.h" | |||
#include "src/naive/cvt_color/opr_impl.h" | |||
#include "src/naive/dct/opr_impl.h" | |||
@@ -37,6 +38,7 @@ | |||
#include "src/naive/elemwise/opr_impl.h" | |||
#include "src/naive/elemwise_multi_type/opr_impl.h" | |||
#include "src/naive/eye/opr_impl.h" | |||
#include "src/naive/fake_quant/opr_impl.h" | |||
#include "src/naive/flip/opr_impl.h" | |||
#include "src/naive/gaussian_blur/opr_impl.h" | |||
#include "src/naive/group_local/opr_impl.h" | |||
@@ -74,13 +76,11 @@ | |||
#include "src/naive/tensor_remap/opr_impl.h" | |||
#include "src/naive/tile/opr_impl.h" | |||
#include "src/naive/topk/opr_impl.h" | |||
#include "src/naive/tqt/opr_impl.h" | |||
#include "src/naive/transpose/opr_impl.h" | |||
#include "src/naive/type_cvt/opr_impl.h" | |||
#include "src/naive/warp_affine/opr_impl.h" | |||
#include "src/naive/warp_perspective/opr_impl.h" | |||
#include "src/naive/remap/opr_impl.h" | |||
#include "src/naive/fake_quant/opr_impl.h" | |||
#include "src/naive/tqt/opr_impl.h" | |||
static size_t g_image2d_pitch_alignment = 1; | |||
@@ -33,20 +33,25 @@ void Images2NeibsForwardImpl::exec_internal(_megdnn_tensor_in src, | |||
int pad_w = static_cast<int>(param().pad_w); | |||
int stride_h = static_cast<int>(param().stride_h); | |||
int stride_w = static_cast<int>(param().stride_w); | |||
int dilate_h = static_cast<int>(param().dilate_h); | |||
int dilate_w = static_cast<int>(param().dilate_w); | |||
int equ_window_h = dilate_h * (window_h-1) + 1; | |||
int equ_window_w = dilate_w * (window_w-1) + 1; | |||
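// effective window extent once dilation is taken into account | |||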
for (int n = 0; n < N; ++n) | |||
for (int c = 0; c < C; ++c) | |||
{ | |||
int ih = -pad_h; | |||
for (; ih+window_h <= IH+pad_h; ih += stride_h) { | |||
for (; ih+equ_window_h <= IH+pad_h; ih += stride_h) { | |||
int iw = -pad_w; | |||
for (; iw+window_w <= IW+pad_w; iw += stride_w) { | |||
for (; iw+equ_window_w <= IW+pad_w; iw += stride_w) { | |||
for (int kh = 0; kh < window_h; ++kh) | |||
for (int kw = 0; kw < window_w; ++kw) | |||
{ | |||
int ih2 = ih+dilate_h*kh, iw2 = iw+dilate_w*kw; | |||
dptr[idx*window_h*window_w + kh*window_w + kw] = | |||
(ih+kh) >= 0 && (ih+kh) < IH && | |||
(iw+kw) >= 0 && (iw+kw) < IW ? | |||
sptr[n*C*IH*IW + c*IH*IW + (ih+kh)*IW + (iw+kw)] : 0.0f; | |||
ih2 >= 0 && ih2 < IH && | |||
iw2 >= 0 && iw2 < IW ? | |||
sptr[n*C*IH*IW + c*IH*IW + ih2*IW + iw2] : 0.0f; | |||
} | |||
++idx; | |||
} | |||
@@ -86,18 +91,22 @@ void Images2NeibsBackwardImpl::exec_internal(_megdnn_tensor_in diff, | |||
int pad_w = static_cast<int>(param().pad_w); | |||
int stride_h = static_cast<int>(param().stride_h); | |||
int stride_w = static_cast<int>(param().stride_w); | |||
int dilate_h = static_cast<int>(param().dilate_h); | |||
int dilate_w = static_cast<int>(param().dilate_w); | |||
int equ_window_h = dilate_h * (window_h-1) + 1; | |||
int equ_window_w = dilate_w * (window_w-1) + 1; | |||
memset(sptr, 0, sizeof(T) * N*C*IH*IW); | |||
for (int n = 0; n < N; ++n) | |||
for (int c = 0; c < C; ++c) | |||
{ | |||
int ih = -pad_h; | |||
for (; ih+window_h <= IH+pad_h; ih += stride_h) { | |||
for (; ih+equ_window_h <= IH+pad_h; ih += stride_h) { | |||
int iw = -pad_w; | |||
for (; iw+window_w <= IW+pad_w; iw += stride_w) { | |||
for (; iw+equ_window_w <= IW+pad_w; iw += stride_w) { | |||
for (int kh = 0; kh < window_h; ++kh) | |||
for (int kw = 0; kw < window_w; ++kw) | |||
{ | |||
int ih2 = ih+kh, iw2 = iw+kw; | |||
int ih2 = ih+dilate_h*kh, iw2 = iw+dilate_w*kw; | |||
if (ih2 >= 0 && ih2 < IH && iw2 >= 0 && iw2 < IW) { | |||
sptr[n*C*IH*IW + c*IH*IW + ih2*IW + iw2] += | |||
dptr[idx*window_h*window_w + kh*window_w + kw]; | |||
@@ -147,7 +147,7 @@ void chanwise::run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, | |||
dim3 nr_block(param.src_chl, | |||
std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); | |||
uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); | |||
kern<<<nr_block, nr_thread, shared, stream>>>(src_grad, dst_grad, flt, | |||
hipLaunchKernelGGL(kern, nr_block, nr_thread, shared, stream, src_grad, dst_grad, flt, | |||
param); | |||
after_kernel_launch(); | |||
} | |||
@@ -105,7 +105,7 @@ void chanwise::run_fwd(T* dst, const T* src, const T* flt, const Param& param, | |||
dim3 nr_block(param.src_chl, | |||
std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); | |||
uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); | |||
kern<<<nr_block, nr_thread, shared, stream>>>(dst, src, flt, param); | |||
hipLaunchKernelGGL(kern, nr_block, nr_thread, shared, stream, dst, src, flt, param); | |||
after_kernel_launch(); | |||
} | |||
@@ -314,7 +314,7 @@ void convolution::exec_inplace_matmul_fwd( | |||
} else { \ | |||
kptr = conv_kernel<BY, BX, false, BufferFetcherTexture>; \ | |||
} \ | |||
kptr<<<blocks, threads, 0, stream>>>( \ | |||
hipLaunchKernelGGL(kptr, blocks, threads, 0, stream, \ | |||
src_tex.val, filter_tex.val, dst, INP_BS, OUT_BS, IC, IH, \ | |||
IW, OC, OH, OW, FH, FW, SH, SW, PH, PW); \ | |||
} else { \ | |||
@@ -324,7 +324,7 @@ void convolution::exec_inplace_matmul_fwd( | |||
} else { \ | |||
kptr = conv_kernel<BY, BX, false, BufferFetcherRaw>; \ | |||
} \ | |||
kptr<<<blocks, threads, 0, stream>>>( \ | |||
hipLaunchKernelGGL(kptr, blocks, threads, 0, stream, \ | |||
src_buf, filter_buf, dst, INP_BS, OUT_BS, IC, IH, IW, OC, \ | |||
OH, OW, FH, FW, SH, SW, PH, PW); \ | |||
} \ | |||
@@ -36,6 +36,7 @@ | |||
#include "src/rocm/argmxx/opr_impl.h" | |||
#include "src/rocm/sleep/opr_impl.h" | |||
#include "src/rocm/batch_normalization/opr_impl.h" | |||
#include "src/rocm/param_pack/opr_impl.h" | |||
#include <miopen/version.h> | |||
#include <hip/hip_version.h> | |||
@@ -174,6 +175,7 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(ArgminForward); | |||
MEGDNN_SPECIALIZE_CREATE_OPERATOR(SleepForward); | |||
MEGDNN_SPECIALIZE_CREATE_OPERATOR(BNForward); | |||
MEGDNN_SPECIALIZE_CREATE_OPERATOR(BNBackward); | |||
MEGDNN_SPECIALIZE_CREATE_OPERATOR(ParamPackConcat); | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wpragmas" | |||
@@ -18,7 +18,6 @@ | |||
#include "src/common/utils.h" | |||
#include "src/rocm/miopen_with_check.h" | |||
#include <rocblas-types.h> | |||
#include <rocblas.h> | |||
#include <atomic> | |||
#include <mutex> | |||
@@ -100,7 +100,8 @@ public: | |||
const char* name() const override { return "BLAS"; } | |||
void exec(const ExecArgs& args) const override; | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
return AlgoAttribute::REPRODUCIBLE | | |||
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
} | |||
MEGDNN_DECL_ALGO_TYPE(ROCM_BLAS) | |||
}; | |||
@@ -11,9 +11,7 @@ | |||
#pragma once | |||
#ifndef __HIP_PLATFORM_HCC__ | |||
#define __HIP_PLATFORM_HCC__ | |||
#endif | |||
#include "hcc_detail/hcc_defs_prologue.h" | |||
#include <miopen/version.h> | |||
#pragma GCC diagnostic push | |||
@@ -0,0 +1,65 @@ | |||
/** | |||
* \file dnn/src/rocm/param_pack/opr_impl.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "hcc_detail/hcc_defs_prologue.h" | |||
#include "src/rocm/param_pack/opr_impl.h" | |||
#include "src/rocm/param_pack/param_pack.h.hip" | |||
#include "src/rocm/utils.h" | |||
namespace megdnn { | |||
namespace rocm { | |||
size_t ParamPackConcatImpl::get_workspace_in_bytes(const TensorShapeArray& srcs, | |||
const TensorShape&, | |||
const TensorShape&) { | |||
return sizeof(size_t) * srcs.size(); | |||
} | |||
template <typename T> | |||
void ParamPackConcatImpl::exec_internal(_megdnn_tensor_in srcs, | |||
_megdnn_tensor_in offsets, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
size_t inp_size = srcs.layout.shape[0], | |||
out_size = dst.layout.total_nr_elems(); | |||
auto stream = hip_stream(this->handle()); | |||
auto src_cpu = static_cast<const T**>(srcs.raw_ptr); | |||
megdnn_assert_internal(src_cpu); | |||
auto src_gpu = reinterpret_cast<const T**>(workspace.raw_ptr); | |||
auto offsets_gpu = offsets.ptr<int32_t>(); | |||
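// srcs holds an array of host-side source pointers; stage it into the device workspace before launching the concat kernel | |||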
hip_check(hipMemcpyAsync(src_gpu, src_cpu, sizeof(const T*) * inp_size, | |||
hipMemcpyHostToDevice, stream)); | |||
param_pack::concat_proxy<T>(src_gpu, dst.ptr<T>(), inp_size, out_size, | |||
offsets_gpu, stream); | |||
} | |||
void ParamPackConcatImpl::exec(_megdnn_tensor_in srcs, | |||
_megdnn_tensor_in offsets, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
check_exec(dst.layout, offsets.layout, srcs.layout); | |||
#define cb(DType) \ | |||
if (dst.layout.dtype == DType()) { \ | |||
using ctype = typename DTypeTrait<DType>::ctype; \ | |||
exec_internal<ctype>(srcs, offsets, dst, workspace); \ | |||
return; \ | |||
} | |||
MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | |||
megdnn_throw("bad type"); | |||
#undef cb | |||
} | |||
} // namespace rocm | |||
} // namespace megdnn |