GitOrigin-RevId: 61c54ad258
@@ -31,35 +31,10 @@ using namespace im2col; | |||
* *Through which we can conveniently get the needed ptr
*/ | |||
struct Im2colBundelIndex { | |||
static constexpr size_t BUNDLE_PADDING_INDEX = 0_z; | |||
static constexpr size_t BUNDLE_PACKA_INDEX = 1_z; | |||
static constexpr size_t BUNDLE_THREAD_INDEX = 2_z; | |||
}; | |||
using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; | |||
//! Process one input channel copy padding | |||
static void copy_padding_kern(WorkspaceBundle& bundle, | |||
const ConvBiasImpl::NCBKernParam& param, | |||
const ConvBiasImpl::NCBKernIndex& ncb_index, | |||
StrategyBase* im2colstrategy, size_t pack_oc_size) { | |||
im2colstrategy->copy_padding_kern(bundle, param, ncb_index, pack_oc_size); | |||
} | |||
//! packA_kern | |||
static void packA_kern( | |||
WorkspaceBundle& bundle, | |||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||
fallback::MatrixMulImpl::KernSizeParam matmulparam, | |||
fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||
StrategyBase* im2colstrategy, | |||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||
size_t pack_oc_size) { | |||
im2colstrategy->packA_kern(bundle, param, matmulparam, matmul_algo, | |||
ncb_index, matmul_desc, pack_oc_size); | |||
} | |||
/*! | |||
* *\brief Im2colKerns collects all the im2col kerns in it | |||
*/ | |||
@@ -124,8 +99,8 @@ public: | |||
WorkspaceBundle get_thread_bundle( | |||
const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, | |||
MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, | |||
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||
size_t oc_tile_size) { | |||
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], | |||
FW = param.filter_meta.spatial[1]; | |||
@@ -205,8 +180,8 @@ public: | |||
} | |||
WorkspaceBundle get_thread_bundle( | |||
const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, | |||
MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, | |||
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||
size_t oc_tile_size) { | |||
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], | |||
FW = param.filter_meta.spatial[1]; | |||
@@ -288,8 +263,8 @@ public: | |||
} | |||
WorkspaceBundle get_thread_bundle( | |||
const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, | |||
MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, | |||
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||
size_t oc_tile_size) { | |||
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], | |||
FW = param.filter_meta.spatial[1]; | |||
@@ -322,15 +297,16 @@ public: | |||
} | |||
}; | |||
fallback::MatrixMulImpl::KernSizeParam | |||
ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param, | |||
size_t ohw_tile_size, | |||
size_t oc_tile_size) const { | |||
namespace { | |||
static fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param( | |||
const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
size_t ohw_tile_size, size_t oc_tile_size) { | |||
auto format = param::MatrixMul::Format::DEFAULT; | |||
size_t pack_oc_size = pack_size(param.filter_meta.format); | |||
if (param.filter_meta.format == param::ConvBias::Format::NCHW44) { | |||
format = param::MatrixMul::Format::MK4; | |||
} else if(param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT){ | |||
} else if (param.filter_meta.format == | |||
param::ConvBias::Format::NCHW44_DOT) { | |||
format = param::MatrixMul::Format::MK4_DOT; | |||
} | |||
size_t M = oc_tile_size; | |||
@@ -358,10 +334,23 @@ ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param, | |||
format}; | |||
} | |||
void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block( | |||
const NCBKernSizeParam& param, size_t& oc_tile_size, | |||
size_t& ohw_tile_size, size_t block_m, size_t block_n, | |||
fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const { | |||
static void choice_ohw_oc_block( | |||
const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
size_t& oc_tile_size, size_t& ohw_tile_size, size_t block_m, | |||
size_t block_n, const size_t m_ohw_tile_size, | |||
fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) { | |||
//! tile sizes are calculated in the choice_ohw_oc_block() function;
//! when ohw_tile_size < this value, ohw_tile_size = ohw
static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32;
//! when nr_threads > 1 and round(ohw, nr_threads) > nr_threads,
//! oc_tile_size = DEFAULT_OC_TILE_SIZE
static constexpr size_t DEFAULT_OC_TILE_SIZE = 512;
//! when oc_tile_size > this value, oc_tile_size =
//! DEFAULT_OC_MAX_TILE_SIZE
static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024;
//! when oc_tile_size < this value, oc_tile_size =
//! DEFAULT_OC_MIN_TILE_SIZE; the purpose is to keep the calculation aligned
static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128;
size_t nr_threads = param.nr_threads; | |||
size_t OC = param.filter_meta.ocpg; | |||
size_t ohw = param.osz[0] * param.osz[1]; | |||
@@ -393,8 +382,74 @@ void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block( | |||
} | |||
} | |||
WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( | |||
const NCBKernSizeParam& param) const { | |||
static size_t packA_group_size( | |||
const MatrixMulImpl::AlgoBase* matmul_algo, | |||
const fallback::MatrixMulImpl::KernSizeParam& matmul_param, | |||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||
size_t packa_parallel_times) { | |||
if (matmul_desc.packmode == | |||
fallback::MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||
return matmul_algo->get_bundle(matmul_param).get_size(0); | |||
} else if (matmul_desc.packmode == | |||
fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||
return packa_parallel_times * | |||
matmul_algo->get_bundle(matmul_param).get_size(0); | |||
} | |||
megdnn_assert(matmul_desc.packmode == | |||
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK); | |||
//! NO_PACK mode needs no packA workspace, so return 0
return 0; | |||
} | |||
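//! Editor note (illustrative sketch, not part of the original diff): for a
//! convolution with GROUP groups the total packA workspace ends up as
//!     packa_size = GROUP * packA_group_size(...);
//! so DEFAULT mode pays one packed-A panel per group, ONLY_PACKA pays
//! packa_parallel_times panels per group, and NO_PACK pays nothing.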
static WorkspaceBundle get_thread_bundle( | |||
const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
const MatrixMulImpl::AlgoBase* matmul_algo, | |||
const fallback::MatrixMulImpl::KernSizeParam& matmul_param, | |||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||
size_t oc_tile_size, size_t ohw_tile_size) { | |||
if (matmul_desc.packmode == Pack_Mode::DEFAULT) { | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_dft"_hash)) { | |||
Im2colKerns<Pack_Mode::DEFAULT> defaultkern; | |||
return defaultkern.get_thread_bundle(param, matmul_param, | |||
matmul_algo, ohw_tile_size, | |||
oc_tile_size); | |||
} | |||
MIDOUT_END(); | |||
} else if (matmul_desc.packmode == | |||
fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv( | |||
"ConvBiasImpl::AlgoIm2col::get_bundle_onlypacka"_hash)) { | |||
Im2colKerns<Pack_Mode::ONLY_PACKA> onlypackakern; | |||
return onlypackakern.get_thread_bundle(param, matmul_param, | |||
matmul_algo, ohw_tile_size, | |||
oc_tile_size); | |||
} | |||
MIDOUT_END(); | |||
} else { | |||
megdnn_assert(matmul_desc.packmode == | |||
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK); | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv( | |||
"ConvBiasImpl::AlgoIm2col::get_thread_bundle_nopack"_hash)) { | |||
Im2colKerns<Pack_Mode::NO_PACK> nopackkern; | |||
return nopackkern.get_thread_bundle(param, matmul_param, | |||
matmul_algo, ohw_tile_size, | |||
oc_tile_size); | |||
} | |||
MIDOUT_END(); | |||
} | |||
return {nullptr, {}}; | |||
} | |||
static WorkspaceBundle get_bundle( | |||
const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_tile_size, | |||
size_t ohw_tile_size) { | |||
UNPACK_CONV_F32_NCB_KERN_SIZES(param); | |||
MEGDNN_MARK_USED_VAR(OC); | |||
MEGDNN_MARK_USED_VAR(OH); | |||
@@ -410,23 +465,20 @@ WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( | |||
size_t padding = 0, packa_size = 0, packa_group_size = 0; | |||
size_t nr_threads = param.nr_threads; | |||
size_t GROUP = param.filter_meta.group; | |||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc = | |||
m_matmul_algo->matmul_description(); | |||
bool need_pack = mdesc.packmode == Pack_Mode::DEFAULT; | |||
bool only_packA = mdesc.packmode == Pack_Mode::ONLY_PACKA; | |||
size_t oc_tile_size = 0, ohw_tile_size = 0; | |||
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | |||
mdesc.innerblocksize.m, mdesc.innerblocksize.n, | |||
mdesc.packmode); | |||
if (need_pack || only_packA) { | |||
auto im2col_kern_param = get_matmul_kern_param( | |||
param, ohw_tile_size, only_packA ? oc_tile_size : OC); | |||
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | |||
WorkspaceBundle wb = m_matmul_algo->get_bundle(im2col_kern_param); | |||
packa_group_size = only_packA ? oc_parallel_times * wb.get_size(0) | |||
: wb.get_size(0); | |||
} else { //! not support pack,not need pack | |||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||
matmul_algo->matmul_description(); | |||
bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; | |||
//! when packmode is DEFAULT, use OC;
//! when packmode is ONLY_PACKA, use oc_tile_size
auto im2col_kern_param = get_matmul_kern_param( | |||
param, ohw_tile_size, default_pack ? OC : oc_tile_size); | |||
if (is_enable_filter_preprocess(param)) { | |||
packa_group_size = 0; | |||
} else { | |||
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | |||
packa_group_size = packA_group_size(matmul_algo, im2col_kern_param, | |||
matmul_desc, oc_parallel_times); | |||
} | |||
if (no_need_pading) { | |||
@@ -437,50 +489,27 @@ WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( | |||
} | |||
packa_size = GROUP * packa_group_size; //! for packA size = GROUP * a_size | |||
WorkspaceBundle ws = {nullptr, {}}; | |||
auto im2col_kern_param = | |||
get_matmul_kern_param(param, ohw_tile_size, oc_tile_size); | |||
if (m_matmul_algo->packmode() == Pack_Mode::DEFAULT) { | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_dft"_hash)) { | |||
Im2colKerns<Pack_Mode::DEFAULT> defaultkern; | |||
ws = defaultkern.get_thread_bundle(param, im2col_kern_param, | |||
m_matmul_algo, ohw_tile_size, | |||
oc_tile_size); | |||
} | |||
MIDOUT_END(); | |||
} else if (m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA) { | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_packa"_hash)) { | |||
Im2colKerns<Pack_Mode::ONLY_PACKA> onlypackakern; | |||
ws = onlypackakern.get_thread_bundle(param, im2col_kern_param, | |||
m_matmul_algo, ohw_tile_size, | |||
oc_tile_size); | |||
} | |||
MIDOUT_END(); | |||
} else { | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_other"_hash)) { | |||
Im2colKerns<Pack_Mode::NO_PACK> nopackkern; | |||
ws = nopackkern.get_thread_bundle(param, im2col_kern_param, | |||
m_matmul_algo, ohw_tile_size, | |||
oc_tile_size); | |||
} | |||
MIDOUT_END(); | |||
} | |||
WorkspaceBundle ws = | |||
get_thread_bundle(param, matmul_algo, im2col_kern_param, | |||
matmul_desc, oc_tile_size, ohw_tile_size); | |||
return {nullptr, | |||
{padding, packa_size, ws.total_size_in_bytes() * nr_threads}}; | |||
} | |||
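//! Editor note (illustrative sketch): the bundle returned above has three
//! slots matching Im2colBundelIndex, namely BUNDLE_PADDING_INDEX (the
//! copy-padding buffer), BUNDLE_PACKA_INDEX (the packed A panels, GROUP *
//! packa_group_size bytes) and BUNDLE_THREAD_INDEX (the per-thread workspace,
//! ws.total_size_in_bytes() * nr_threads).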
} // namespace | |||
size_t ConvBiasImpl::AlgoIm2col::get_workspace( | |||
const NCBKernSizeParam& p) const { | |||
MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 0) { | |||
return get_bundle(p).total_size_in_bytes(); | |||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||
m_matmul_algo->matmul_description(); | |||
size_t oc_tile_size = 0, ohw_tile_size = 0; | |||
choice_ohw_oc_block(p, oc_tile_size, ohw_tile_size, | |||
matmul_desc.innerblocksize.m, matmul_desc.innerblocksize.n, | |||
m_ohw_tile_size, matmul_desc.packmode); | |||
return get_bundle(p, m_matmul_algo, oc_tile_size, ohw_tile_size) | |||
.total_size_in_bytes(); | |||
} | |||
MIDOUT_END(); | |||
return 0; | |||
@@ -499,22 +528,21 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||
size_t oc_tile_size = 0, ohw_tile_size = 0; | |||
size_t ohw = OH * OW; | |||
size_t GROUP = param.filter_meta.group; | |||
WorkspaceBundle bundle = get_bundle(param); | |||
WorkspaceBundle bundle_thread = {nullptr, {}}; | |||
bool need_padding = (PH != 0 || PW != 0); | |||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc = | |||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||
m_matmul_algo->matmul_description(); | |||
Pack_Mode packmode = mdesc.packmode; | |||
bool default_pack = packmode == Pack_Mode::DEFAULT; | |||
bool no_pack = packmode == Pack_Mode::NO_PACK; | |||
bool only_packA = packmode == Pack_Mode::ONLY_PACKA; | |||
bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; | |||
bool no_pack = matmul_desc.packmode == Pack_Mode::NO_PACK; | |||
bool only_packA = matmul_desc.packmode == Pack_Mode::ONLY_PACKA; | |||
bool enable_filter_preprocess = is_enable_filter_preprocess(param); | |||
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | |||
mdesc.innerblocksize.m, mdesc.innerblocksize.n, | |||
mdesc.packmode); | |||
matmul_desc.innerblocksize.m, | |||
matmul_desc.innerblocksize.n, m_ohw_tile_size, | |||
matmul_desc.packmode); | |||
WorkspaceBundle bundle =
get_bundle(param, m_matmul_algo, oc_tile_size, ohw_tile_size); | |||
size_t ohw_parallel_times = div_ceil(ohw, ohw_tile_size); | |||
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | |||
size_t packa_parallel_times = 0; | |||
@@ -523,28 +551,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||
if (only_packA) { | |||
packa_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | |||
} else if (default_pack) { | |||
packa_parallel_times = div_ceil<size_t>(OC, mdesc.innerblocksize.m); | |||
packa_parallel_times = | |||
div_ceil<size_t>(OC, matmul_desc.innerblocksize.m); | |||
} | |||
auto matmul_param = get_matmul_kern_param( | |||
param, ohw_tile_size, only_packA ? oc_tile_size : OC); | |||
if (mdesc.packmode == Pack_Mode::DEFAULT) { | |||
Im2colKerns<Pack_Mode::DEFAULT> defaultkern; | |||
bundle_thread = defaultkern.get_thread_bundle( | |||
param, matmul_param, m_matmul_algo, ohw_tile_size, | |||
oc_tile_size); | |||
} else if (mdesc.packmode == Pack_Mode::ONLY_PACKA) { | |||
Im2colKerns<Pack_Mode::ONLY_PACKA> onlypackakern; | |||
bundle_thread = onlypackakern.get_thread_bundle( | |||
param, matmul_param, m_matmul_algo, ohw_tile_size, | |||
oc_tile_size); | |||
} else { | |||
Im2colKerns<Pack_Mode::NO_PACK> nopackkern; | |||
bundle_thread = nopackkern.get_thread_bundle( | |||
param, matmul_param, m_matmul_algo, ohw_tile_size, | |||
oc_tile_size); | |||
} | |||
param, ohw_tile_size, default_pack ? OC : oc_tile_size); | |||
WorkspaceBundle bundle_thread = | |||
get_thread_bundle(param, m_matmul_algo, matmul_param, | |||
matmul_desc, oc_tile_size, ohw_tile_size); | |||
StrategyParam strategyparam; | |||
strategyparam.ohw = ohw; | |||
strategyparam.is_dst_8bit = | |||
@@ -557,6 +573,9 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||
strategyparam.is_ohw_size_bigger && !strategyparam.is_dst_8bit; | |||
strategyparam.oc_tile_size = oc_tile_size; | |||
strategyparam.pack_oc_size = pack_oc_size; | |||
strategyparam.enable_filter_preprocess = enable_filter_preprocess; | |||
strategyparam.packA_group_size = packA_group_size( | |||
m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); | |||
SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||
MIDOUT_BEGIN( | |||
@@ -569,88 +588,126 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||
const NCBKernParam& param, | |||
const NCBKernIndex& ncb_index) mutable { | |||
bundle.set(param.workspace_ptr); | |||
copy_padding_kern(bundle, param, ncb_index, im2colstrategy, | |||
pack_oc_size); | |||
im2colstrategy->copy_padding_kern(bundle, param, ncb_index, | |||
pack_oc_size); | |||
}; | |||
auto kern_packA = [bundle, matmul_algo = m_matmul_algo, | |||
matmul_param, im2colstrategy, | |||
pack_oc_size = pack_oc_size, mdesc = mdesc]( | |||
strategyparam = strategyparam, | |||
matmul_desc = matmul_desc]( | |||
const NCBKernParam& param, | |||
const NCBKernIndex& ncb_index) mutable { | |||
bundle.set(param.workspace_ptr); | |||
packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index, | |||
im2colstrategy, mdesc, pack_oc_size); | |||
im2colstrategy->packA_kern(bundle, param, matmul_param, | |||
matmul_algo, ncb_index, matmul_desc, | |||
strategyparam); | |||
}; | |||
if (default_pack) { | |||
auto kern_compute_default = | |||
[bundle, bundle_thread, matmul_param, | |||
matmul_algo = m_matmul_algo, | |||
ohw_tile_size = ohw_tile_size, | |||
strategyparam = strategyparam, matmul_desc = mdesc, | |||
im2colstrategy]( | |||
const NCBKernParam& param, | |||
const NCBKernIndex& ncb_index) mutable { | |||
bundle.set(param.workspace_ptr); | |||
Im2colKerns<Pack_Mode::DEFAULT>::kerns( | |||
bundle, bundle_thread, param, matmul_param, | |||
matmul_algo, matmul_desc, strategyparam, | |||
ncb_index, ohw_tile_size, im2colstrategy); | |||
}; | |||
ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); | |||
if (need_padding) { | |||
ret_kern.push_back({kern_padding, | |||
{param.n, GROUP, IC / pack_oc_size}}); | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv( | |||
"ConvBiasImpl::AlgoIm2col::dispatch_kerns_default_pack"_hash)) { | |||
auto kern_compute_default = | |||
[bundle, bundle_thread, matmul_param, | |||
matmul_algo = m_matmul_algo, | |||
ohw_tile_size = ohw_tile_size, | |||
strategyparam = strategyparam, | |||
matmul_desc = matmul_desc, im2colstrategy]( | |||
const NCBKernParam& param, | |||
const NCBKernIndex& ncb_index) mutable { | |||
bundle.set(param.workspace_ptr); | |||
Im2colKerns<Pack_Mode::DEFAULT>::kerns( | |||
bundle, bundle_thread, param, | |||
matmul_param, matmul_algo, matmul_desc, | |||
strategyparam, ncb_index, ohw_tile_size, | |||
im2colstrategy); | |||
}; | |||
if (!enable_filter_preprocess) { | |||
ret_kern.push_back( | |||
{kern_packA, {GROUP, packa_parallel_times}}); | |||
} | |||
if (need_padding) { | |||
ret_kern.push_back( | |||
{kern_padding, | |||
{param.n, GROUP, IC / pack_oc_size}}); | |||
} | |||
ret_kern.push_back({kern_compute_default, | |||
{N, GROUP, ohw_parallel_times, | |||
oc_parallel_times}}); | |||
return ret_kern; | |||
} | |||
ret_kern.push_back( | |||
{kern_compute_default, | |||
{N, GROUP, ohw_parallel_times, oc_parallel_times}}); | |||
MIDOUT_END(); | |||
return {}; | |||
} else if (only_packA) { | |||
auto kern_compute_onlypackA = | |||
[bundle, bundle_thread, matmul_param, | |||
matmul_algo = m_matmul_algo, | |||
strategyparam = strategyparam, | |||
ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, | |||
im2colstrategy]( | |||
const NCBKernParam& param, | |||
const NCBKernIndex& ncb_index) mutable { | |||
bundle.set(param.workspace_ptr); | |||
Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns( | |||
bundle, bundle_thread, param, matmul_param, | |||
matmul_algo, matmul_desc, strategyparam, | |||
ncb_index, ohw_tile_size, im2colstrategy); | |||
}; | |||
ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); | |||
if (need_padding) { | |||
ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv( | |||
"ConvBiasImpl::AlgoIm2col::dispatch_kerns_onlypacka"_hash)) { | |||
auto kern_compute_onlypackA = | |||
[bundle, bundle_thread, matmul_param, | |||
matmul_algo = m_matmul_algo, | |||
strategyparam = strategyparam, | |||
ohw_tile_size = ohw_tile_size, | |||
matmul_desc = matmul_desc, im2colstrategy]( | |||
const NCBKernParam& param, | |||
const NCBKernIndex& ncb_index) mutable { | |||
bundle.set(param.workspace_ptr); | |||
Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns( | |||
bundle, bundle_thread, param, | |||
matmul_param, matmul_algo, matmul_desc, | |||
strategyparam, ncb_index, ohw_tile_size, | |||
im2colstrategy); | |||
}; | |||
if (!enable_filter_preprocess) { | |||
ret_kern.push_back( | |||
{kern_packA, {GROUP, packa_parallel_times}}); | |||
} | |||
if (need_padding) { | |||
ret_kern.push_back( | |||
{kern_padding, {param.n, GROUP, IC}}); | |||
} | |||
ret_kern.push_back({kern_compute_onlypackA, | |||
{N, GROUP, ohw_parallel_times, | |||
oc_parallel_times}}); | |||
return ret_kern; | |||
} | |||
ret_kern.push_back( | |||
{kern_compute_onlypackA, | |||
{N, GROUP, ohw_parallel_times, oc_parallel_times}}); | |||
MIDOUT_END(); | |||
return {}; | |||
} else if (no_pack) { | |||
auto kern_compute_nopack = | |||
[bundle, bundle_thread, matmul_param, | |||
matmul_algo = m_matmul_algo, | |||
strategyparam = strategyparam, | |||
ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, | |||
im2colstrategy]( | |||
const NCBKernParam& param, | |||
const NCBKernIndex& ncb_index) mutable { | |||
bundle.set(param.workspace_ptr); | |||
Im2colKerns<Pack_Mode::NO_PACK>::kerns( | |||
bundle, bundle_thread, param, matmul_param, | |||
matmul_algo, matmul_desc, strategyparam, | |||
ncb_index, ohw_tile_size, im2colstrategy); | |||
}; | |||
if (need_padding) { | |||
ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv( | |||
"ConvBiasImpl::AlgoIm2col::dispatch_kerns_no_pack"_hash)) { | |||
auto kern_compute_nopack = | |||
[bundle, bundle_thread, matmul_param, | |||
matmul_algo = m_matmul_algo, | |||
strategyparam = strategyparam, | |||
ohw_tile_size = ohw_tile_size, | |||
matmul_desc = matmul_desc, im2colstrategy]( | |||
const NCBKernParam& param, | |||
const NCBKernIndex& ncb_index) mutable { | |||
bundle.set(param.workspace_ptr); | |||
Im2colKerns<Pack_Mode::NO_PACK>::kerns( | |||
bundle, bundle_thread, param, | |||
matmul_param, matmul_algo, matmul_desc, | |||
strategyparam, ncb_index, ohw_tile_size, | |||
im2colstrategy); | |||
}; | |||
if (need_padding) { | |||
ret_kern.push_back( | |||
{kern_padding, {param.n, GROUP, IC}}); | |||
} | |||
ret_kern.push_back({kern_compute_nopack, | |||
{N, GROUP, ohw_parallel_times, | |||
oc_parallel_times}}); | |||
return ret_kern; | |||
} | |||
ret_kern.push_back( | |||
{kern_compute_nopack, | |||
{N, GROUP, ohw_parallel_times, oc_parallel_times}}); | |||
MIDOUT_END(); | |||
return {}; | |||
} | |||
return ret_kern; | |||
return {}; | |||
} | |||
MIDOUT_END(); | |||
return {}; | |||
@@ -694,12 +751,19 @@ bool ConvBiasImpl::AlgoIm2col::usable( | |||
return false; | |||
} | |||
} | |||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc = | |||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||
m_matmul_algo->matmul_description(); | |||
//! only matmul algos whose packmode is ONLY_PACKA or DEFAULT support weight preprocess
if (is_enable_filter_preprocess(param) && | |||
(matmul_desc.packmode == | |||
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) { | |||
return false; | |||
} | |||
if (format == param::ConvBias::Format::NCHW44 || | |||
format == param::ConvBias::Format::NCHW44_DOT) { | |||
//! currently NCHW44 im2col only supports DEFAULT-mode matmul
if (mdesc.packmode != Pack_Mode::DEFAULT) { | |||
if (matmul_desc.packmode != Pack_Mode::DEFAULT) { | |||
return false; | |||
//! nchw44 hybrid mode and channel-wise are not supported
} else if (param.filter_meta.icpg < 4_z || | |||
@@ -711,8 +775,9 @@ bool ConvBiasImpl::AlgoIm2col::usable( | |||
size_t oc_tile_size = 0, ohw_tile_size = 0; | |||
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | |||
mdesc.innerblocksize.m, mdesc.innerblocksize.n, | |||
m_matmul_algo->packmode()); | |||
matmul_desc.innerblocksize.m, | |||
matmul_desc.innerblocksize.n, m_ohw_tile_size, | |||
matmul_desc.packmode); | |||
fallback::MatrixMulImpl::KernSizeParam matmul_param = | |||
get_matmul_kern_param(param, ohw_tile_size, oc_tile_size); | |||
bool matmulusable = m_matmul_algo->usable(matmul_param); | |||
@@ -731,4 +796,104 @@ bool ConvBiasImpl::AlgoIm2col::usable( | |||
return false; | |||
} | |||
SmallVector<TensorLayout> | |||
ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout( | |||
const NCBKernSizeParam& param) const { | |||
MIDOUT_BEGIN( | |||
megdnn_fallback_im2col, | |||
midout_iv( | |||
"ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout"_hash)) { | |||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||
m_matmul_algo->matmul_description(); | |||
//! only default_pack and only_packa modes are supported
if (matmul_desc.packmode == Pack_Mode::NO_PACK) { | |||
return {}; | |||
} | |||
size_t GROUP = param.filter_meta.group; | |||
bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; | |||
size_t OC = param.filter_meta.ocpg; | |||
SmallVector<TensorLayout> preprocessed_layouts; | |||
size_t oc_tile_size = 0, ohw_tile_size = 0; | |||
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | |||
matmul_desc.innerblocksize.m, | |||
matmul_desc.innerblocksize.n, m_ohw_tile_size, | |||
matmul_desc.packmode); | |||
auto matmul_param = get_matmul_kern_param( | |||
param, ohw_tile_size, default_pack ? OC : oc_tile_size); | |||
size_t packa_parallel_times = div_ceil<size_t>( | |||
OC, default_pack ? matmul_desc.innerblocksize.m : oc_tile_size); | |||
size_t packa_group_size = packA_group_size( | |||
m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); | |||
preprocessed_layouts.push_back( | |||
{{GROUP, packa_group_size}, dtype::Int8()}); | |||
return preprocessed_layouts; | |||
} | |||
MIDOUT_END(); | |||
return {}; | |||
} | |||
SmallVector<ConvBiasImpl::NCBKern> | |||
ConvBiasImpl::AlgoIm2col::dispatch_preprocess_kerns( | |||
const NCBKernSizeParam& param) const { | |||
MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 3) { | |||
size_t OC = param.filter_meta.ocpg; | |||
size_t oc_tile_size = 0, ohw_tile_size = 0; | |||
size_t GROUP = param.filter_meta.group; | |||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||
m_matmul_algo->matmul_description(); | |||
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | |||
matmul_desc.innerblocksize.m, | |||
matmul_desc.innerblocksize.n, m_ohw_tile_size, | |||
matmul_desc.packmode); | |||
WorkspaceBundle bundle = | |||
get_bundle(param, m_matmul_algo, oc_tile_size, ohw_tile_size); | |||
Pack_Mode packmode = matmul_desc.packmode; | |||
bool default_pack = packmode == Pack_Mode::DEFAULT; | |||
bool only_packA = packmode == Pack_Mode::ONLY_PACKA; | |||
size_t packa_parallel_times = 0; | |||
if (only_packA) { | |||
packa_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | |||
} else if (default_pack) { | |||
packa_parallel_times = | |||
div_ceil<size_t>(OC, matmul_desc.innerblocksize.m); | |||
} else { | |||
//! if NO_PACK, return an empty vector so that OprWeightPreprocessProxy can
//! still run in no-pack mode
return {}; | |||
} | |||
auto matmul_param = get_matmul_kern_param( | |||
param, ohw_tile_size, default_pack ? OC : oc_tile_size); | |||
StrategyParam strategyparam; | |||
strategyparam.enable_filter_preprocess = | |||
is_enable_filter_preprocess(param); | |||
strategyparam.packA_group_size = packA_group_size( | |||
m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); | |||
SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||
StrategyBase* im2colstrategy = | |||
Factory::get_im2col_strategy(param, m_matmul_algo); | |||
auto kern_packA = [bundle, matmul_algo = m_matmul_algo, matmul_param, | |||
im2colstrategy, strategyparam = strategyparam, | |||
matmul_desc = matmul_desc]( | |||
const NCBKernParam& param, | |||
const NCBKernIndex& ncb_index) mutable { | |||
bundle.set(param.workspace_ptr); | |||
im2colstrategy->packA_kern(bundle, param, matmul_param, matmul_algo, | |||
ncb_index, matmul_desc, strategyparam); | |||
}; | |||
ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); | |||
return ret_kern; | |||
} | |||
MIDOUT_END(); | |||
return {}; | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -22,27 +22,6 @@ namespace megdnn { | |||
namespace fallback { | |||
class ConvBiasImpl::AlgoIm2col final : public AlgoBase { | |||
//! calculate m_oc_tile_size in choice_ohw_oc_block() fucntion, | |||
//! when m_oc_tile_size < this value m_oc_tile_size = ohw | |||
static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32; | |||
//! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads, | |||
//! m_oc_tile_size = DEFAULT_OC_TILE_SIZE | |||
static constexpr size_t DEFAULT_OC_TILE_SIZE = 512; | |||
//! when m_oc_tile_size > this value m_oc_tile_size = | |||
//! DEFAULT_OC_MAX_TILE_SIZE | |||
static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024; | |||
//! when m_oc_tile_size < this value m_oc_tile_size = | |||
//! DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation | |||
static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128; | |||
fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param( | |||
const NCBKernSizeParam& param, size_t ohw_tile_size, | |||
size_t oc_tile_size) const; | |||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | |||
void choice_ohw_oc_block( | |||
const NCBKernSizeParam& param, size_t& oc_tile_size, | |||
size_t& ohw_tile_size, size_t block_m, size_t block_n, | |||
fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const; | |||
public: | |||
AlgoIm2col(MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size) | |||
: m_matmul_algo(matmul_algo), | |||
@@ -59,10 +38,16 @@ public: | |||
bool usable(const NCBKernSizeParam& param, | |||
AlgoSelectionStrategy algo_selection_strategy) const override; | |||
size_t get_workspace(const NCBKernSizeParam& param) const override; | |||
SmallVector<NCBKern> dispatch_kerns( | |||
SmallVector<NCBKern> dispatch_kerns(const NCBKernSizeParam& param) const override; | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const NCBKernSizeParam& param) const override; | |||
size_t get_preprocess_workspace( | |||
const NCBKernSizeParam& /*param*/) const override { | |||
return 0; | |||
} | |||
SmallVector<NCBKern> dispatch_preprocess_kerns( | |||
const NCBKernSizeParam& param) const override; | |||
bool is_preferred( | |||
const NCBKernSizeParam& param) const override { | |||
bool is_preferred(const NCBKernSizeParam& param) const override { | |||
if (param.src_type.category() == DTypeCategory::QUANTIZED) { | |||
static CpuOprDelegationStorage<1> storage; | |||
auto conv_bias_opr = storage.get<ConvBias, 0>(); | |||
@@ -40,9 +40,11 @@ struct StrategyParam { | |||
size_t block_n; | |||
size_t block_k; | |||
size_t pack_oc_size; | |||
size_t packA_group_size; | |||
bool skip_copy_dst; | |||
bool is_dst_8bit; | |||
bool is_ohw_size_bigger; | |||
bool enable_filter_preprocess; | |||
}; | |||
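//! Editor note (illustrative, inferred from the surrounding diff): the two new
//! fields carry what packA_kern and the exec paths need. packA_group_size is
//! the per-group byte size of the packed A buffer, used for
//! group_id * packA_group_size offsets, and enable_filter_preprocess tells the
//! strategy to read the packed weights from
//! param.preprocessed_filter->tensors[0] instead of BUNDLE_PACKA_INDEX.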
class StrategyBase { | |||
@@ -62,7 +64,7 @@ public: | |||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||
matmul_desec, | |||
size_t pack_size) = 0; | |||
const StrategyParam& sparam) = 0; | |||
virtual void exec_im2col( | |||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||
@@ -296,7 +298,7 @@ public: | |||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||
matmul_desc, | |||
size_t pack_size) override; | |||
const StrategyParam& sparam) override; | |||
virtual void exec_im2col( | |||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||
const StrategyParam& sparam, | |||
@@ -375,7 +377,7 @@ public: | |||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | |||
size_t pack_size) override; | |||
const StrategyParam& sparam) override; | |||
void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
const StrategyParam& sparam, const WorkspaceBundle& bundle, | |||
@@ -431,7 +433,7 @@ public: | |||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | |||
size_t pack_size) override; | |||
const StrategyParam& sparam) override; | |||
void exec_im2col( | |||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||
@@ -25,19 +25,23 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||
matmul_desc, | |||
size_t) { | |||
const StrategyParam& sparam) { | |||
fallback::MatrixMulImpl::KernParam matmul_param; | |||
size_t group_id = ncb_index.ndrange_id[0]; | |||
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | |||
matmulparam; | |||
size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0); | |||
size_t packed_per_oc_block_size = | |||
round_up(matmul_param.K, matmul_desc.innerblocksize.k) * | |||
matmul_desc.innerblocksize.m * matmul_desc.packa_type_size; | |||
size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size; | |||
int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) + | |||
group_id * packA_group_size + a_panel_offset; | |||
int8_t* tmp_ptr = | |||
sparam.enable_filter_preprocess | |||
? static_cast<int8_t*>( | |||
param.preprocessed_filter->tensors[0].raw_ptr) | |||
: static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)); | |||
int8_t* a_panel = | |||
tmp_ptr + group_id * sparam.packA_group_size + a_panel_offset; | |||
matmul_param.A_ptr = | |||
const_cast<src_ctype*>(param.filter<src_ctype>(group_id)); | |||
matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[1], | |||
@@ -149,15 +153,20 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
size_t packA_per_oc_block_size = | |||
round_up(matmul_param.K, matmul_desc.innerblocksize.k) * | |||
sparam.oc_tile_size * matmul_desc.packa_type_size; | |||
size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0); | |||
size_t packA_group_size = sparam.packA_group_size; | |||
size_t a_panel_offset = ncb_index.ndrange_id[1] * packA_group_size + | |||
ncb_index.ndrange_id[3] * packA_per_oc_block_size; | |||
void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); | |||
src_ctype* a_panel = reinterpret_cast<src_ctype*>( | |||
reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) + | |||
a_panel_offset); | |||
int8_t* tmp_ptr = | |||
sparam.enable_filter_preprocess | |||
? static_cast<int8_t*>( | |||
param.preprocessed_filter->tensors[0].raw_ptr) | |||
: static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)); | |||
src_ctype* a_panel = | |||
reinterpret_cast<src_ctype*>(tmp_ptr + a_panel_offset); | |||
src_ctype* b_panel = | |||
reinterpret_cast<src_ctype*>(reinterpret_cast<uintptr_t>( | |||
bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX))); | |||
@@ -26,7 +26,7 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||
const fallback::MatrixMulImpl::AlgoBase:: | |||
MatmulDescription& /*matmul_dsec*/, | |||
size_t) { | |||
const StrategyParam&) { | |||
MEGDNN_MARK_USED_VAR(bundle); | |||
MEGDNN_MARK_USED_VAR(param); | |||
MEGDNN_MARK_USED_VAR(matmulparam); | |||
@@ -26,7 +26,7 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||
const fallback::MatrixMulImpl::AlgoBase:: | |||
MatmulDescription& /*matmul_desc*/, | |||
size_t) { | |||
const StrategyParam& sparam) { | |||
fallback::MatrixMulImpl::KernParam matmul_param; | |||
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | |||
matmulparam; | |||
@@ -36,12 +36,17 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
size_t output_block_oc_size = | |||
std::min(oc_tile_size, OC - ncb_index.ndrange_id[1] * oc_tile_size); | |||
size_t oc_cur_index = ncb_index.ndrange_id[1] * oc_tile_size; | |||
size_t packA_group_size = | |||
bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group; | |||
size_t a_panel_offset = ncb_index.ndrange_id[1] * | |||
matmul_algo->get_bundle(matmul_param).get_size(0); | |||
int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) + | |||
group_id * packA_group_size + a_panel_offset; | |||
int8_t* tmp_ptr = | |||
sparam.enable_filter_preprocess | |||
? static_cast<int8_t*>( | |||
param.preprocessed_filter->tensors[0].raw_ptr) | |||
: static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)); | |||
int8_t* a_panel = tmp_ptr + | |||
group_id * sparam.packA_group_size + a_panel_offset; | |||
matmul_param.A_ptr = | |||
const_cast<src_ctype*>(param.filter<src_ctype>(group_id)) + | |||
oc_cur_index * matmul_param.K; | |||
@@ -60,20 +65,22 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
fallback::MatrixMulImpl::KernParam matmul_param, | |||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||
const fallback::MatrixMulImpl::AlgoBase:: | |||
MatmulDescription& /*matmul_desc*/ | |||
) { | |||
size_t packA_group_size = | |||
bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group; | |||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||
/*matmul_desc*/) { | |||
size_t a_panel_offset = ncb_index.ndrange_id[3] * | |||
matmul_algo->get_bundle(matmul_param).get_size(0); | |||
a_panel_offset = sparam.group_id * packA_group_size + a_panel_offset; | |||
a_panel_offset = | |||
sparam.group_id * sparam.packA_group_size + a_panel_offset; | |||
void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); | |||
src_ctype* a_panel = reinterpret_cast<src_ctype*>( | |||
reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) + | |||
a_panel_offset); | |||
int8_t* tmp_ptr = | |||
sparam.enable_filter_preprocess | |||
? static_cast<int8_t*>( | |||
param.preprocessed_filter->tensors[0].raw_ptr) | |||
: static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)); | |||
src_ctype* a_panel = reinterpret_cast<src_ctype*>(tmp_ptr + a_panel_offset); | |||
src_ctype* b_panel = nullptr; | |||
src_ctype* im2col_dst = static_cast<src_ctype*>( | |||
@@ -154,7 +154,8 @@ void ConvBiasImpl::exec_preprocess(const TensorLayout& src_layout, | |||
bias{nullptr, bias_layout}; | |||
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, | |||
preprocessed_filter); | |||
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||
//! must not pass the workspace size limit, otherwise no matching algo can be found
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam); | |||
if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | |||
fparam) <= workspace.size) { | |||
exec_preprocess_with_ncb_kern(fparam, algo); | |||
@@ -299,6 +299,11 @@ private: | |||
const PreprocessedFilter* preprocessed_filter); | |||
}; | |||
inline bool is_enable_filter_preprocess( | |||
const ConvBiasImpl::NCBKernSizeParam& param) { | |||
return param.preprocessed_filter && | |||
param.preprocessed_filter->tensors.size() >= 1; | |||
} | |||
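//! Illustrative usage (editor sketch, assuming the strategy code shown above):
//! callers use this helper to decide where the packed filter lives, e.g.
//!     int8_t* src = is_enable_filter_preprocess(param)
//!             ? static_cast<int8_t*>(
//!                       param.preprocessed_filter->tensors[0].raw_ptr)
//!             : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));
//! which mirrors the tmp_ptr selection in the im2col strategies.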
} // namespace fallback | |||
} // namespace megdnn | |||
@@ -109,7 +109,9 @@ void ConvolutionImpl::exec_preprocess(const TensorLayout& src_layout, | |||
TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}; | |||
auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, | |||
workspace); | |||
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||
//! must not pass the workspace size limit, otherwise no matching algo can be found
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam); | |||
if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | |||
fparam) <= workspace.size) { | |||
exec_preprocess_with_ncb_kern(fparam, algo); | |||
@@ -1837,6 +1837,21 @@ void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle, | |||
{arg.src, arg.filter, arg.bias, {}, {}}); | |||
} | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2_PREPROCESS) { | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_conv_bias_args({1, 2, 3, 4, 5, 6, 7}, 2, false, false, false), \ | |||
handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \ | |||
dtype::Float32(), dtype::Float32(), name); | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_F32K8X12X1") | |||
cb("IM2COLMATMUL:AARCH64_F32K4X16X1") | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:ARMV7_F32") | |||
#endif | |||
#undef cb | |||
} | |||
// clang-format off | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2) { | |||
#define cb(name) \ | |||
@@ -1851,6 +1866,22 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2) { | |||
cb("IM2COLMATMUL:ARMV7_F32") | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE1_PREPROCESS) { | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false), \ | |||
handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \ | |||
dtype::Float32(), dtype::Float32(), name); | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_F32K8X12X1") | |||
cb("IM2COLMATMUL:AARCH64_F32K4X16X1") | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:ARMV7_F32") | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE1) { | |||
@@ -1899,6 +1930,37 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM) { | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_PREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess(get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ | |||
false, true, true), \ | |||
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||
dtype::QuantizedS8(60.25f), name); \ | |||
check_conv_bias_preprocess( \ | |||
get_conv_bias_args({1}, 2, false, false, false, true, true), \ | |||
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||
dtype::QuantizedS8(60.25f), name); | |||
float epsilon = 0.001; | |||
#if MEGDNN_AARCH64 | |||
#if __ARM_FEATURE_DOTPROD | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X12X4_DOTPROD"); | |||
#else | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X8X8"); | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16"); | |||
#endif | |||
#elif MEGDNN_ARMV7 | |||
epsilon = 1; | |||
cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8"); | |||
#endif | |||
#undef cb | |||
} | |||
#if __ARM_FEATURE_DOTPROD | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT) { | |||
@@ -1924,6 +1986,29 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT) { | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT_PREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess(get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, \ | |||
false, false, false, true), \ | |||
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||
dtype::QuantizedS8(60.25f), name); \ | |||
checker_conv_bias( \ | |||
get_nchw44_conv_bias_args({1}, 2, false, true, true, false, true), \ | |||
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||
dtype::QuantizedS8(60.25f), name); | |||
float epsilon = 0.001; | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT_S2_FUSE) { | |||
UniformIntRNG rng{-50, 50}; | |||
@@ -1968,6 +2053,31 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_S8x8x32_MK4_DOT) { | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_S8x8x32_MK4_DOT_PREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ | |||
true, false, true, false, false, true), \ | |||
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), {}, name); \ | |||
check_conv_bias_preprocess( \ | |||
get_nchw44_conv_bias_args({1}, 2, false, true, true, false, true, \ | |||
false, false, true), \ | |||
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), {}, name); | |||
float epsilon = 0.001; | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT) { | |||
UniformIntRNG rng{-50, 50}; | |||
@@ -1992,6 +2102,30 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT) { | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT_PREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ | |||
true, false, true, false, false, true), \ | |||
handle(), &rng, epsilon, dtype::Int8(), dtype::Int8(), \ | |||
dtype::Int32(), {}, name); \ | |||
check_conv_bias_preprocess( \ | |||
get_nchw44_conv_bias_args({1}, 2, false, true, true, false, true, \ | |||
false, false, true), \ | |||
handle(), &rng, epsilon, dtype::Int8(), dtype::Int8(), \ | |||
dtype::Int32(), {}, name); | |||
float epsilon = 0.001; | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_CONV1x1_QUANTIZEDSYM_MK4_DOT) { | |||
UniformIntRNG rng{-50, 50}; | |||
@@ -2055,6 +2189,41 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDASYM) { | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, | |||
CONV_BIAS_IM2COLMATMUL_QUANTIZEDASYM_FILTERPREPROCESS) { | |||
NormalRNG rng(128.f); | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false, \ | |||
true, true), \ | |||
handle(), &rng, epsilon, \ | |||
dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ | |||
dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ | |||
dtype::QuantizedS32(1.2 * 1.3), \ | |||
dtype::Quantized8Asymm(50.3f, (uint8_t)120), name); \ | |||
check_conv_bias_preprocess( \ | |||
get_conv_bias_args({1}, 2, false, false, false, true, true), \ | |||
handle(), &rng, epsilon, \ | |||
dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ | |||
dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ | |||
dtype::QuantizedS32(1.2 * 1.3), \ | |||
dtype::Quantized8Asymm(50.3f, (uint8_t)120), name); | |||
float epsilon = 0.001; | |||
#if MEGDNN_AARCH64 | |||
#if __ARM_FEATURE_DOTPROD | |||
cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X4_DOTPROD"); | |||
#else | |||
cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X8"); | |||
#endif | |||
#elif MEGDNN_ARMV7 | |||
epsilon = 1; | |||
cb("IM2COLMATMUL:ARMV7_QUINT8_K4X8X8"); | |||
#endif | |||
#undef cb | |||
} | |||
#endif | |||
#if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
@@ -2088,6 +2257,39 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUINT8x8x32) { | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUINT8x8x32_FILTERPREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
float epsilon = 0.001; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \ | |||
handle(), &rng, epsilon, \ | |||
dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ | |||
dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ | |||
dtype::QuantizedS32(1.2 * 1.3), {}, name); \ | |||
check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \ | |||
handle(), &rng, epsilon, \ | |||
dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ | |||
dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ | |||
dtype::QuantizedS32(1.2 * 1.3), {}, name); | |||
#if MEGDNN_AARCH64 | |||
#if __ARM_FEATURE_DOTPROD | |||
cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X4_DOTPROD"); | |||
#else | |||
cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X8"); | |||
#endif | |||
#elif MEGDNN_ARMV7 | |||
#if __ARM_FEATURE_DOTPROD | |||
cb("IM2COLMATMUL:AARCH32_QUINT8_K4X8X4"); | |||
#endif | |||
cb("IM2COLMATMUL:ARMV7_QUINT8_K4X8X8"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16) { | |||
UniformIntRNG rng{-50, 50}; | |||
float epsilon = 0.001; | |||
@@ -2127,6 +2329,51 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16) { | |||
#undef cb | |||
#undef cb_nchw44 | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16_FILTERPREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
float epsilon = 0.001; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \ | |||
handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, \ | |||
dtype::Int16{}, dtype::Int16{}, name); \ | |||
check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \ | |||
handle(), &rng, epsilon, dtype::Int8{}, \ | |||
dtype::Int8{}, dtype::Int16{}, dtype::Int16{}, \ | |||
name); | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X16_K8X8X8"); | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X16_K4X4X16"); | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:ARMV7_INT8X8X16_K4X8X8"); | |||
cb("IM2COLMATMUL:ARMV7_INT8X8X16_K4X2X16"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16_NOPACK_FILTERPREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
float epsilon = 0.001; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \ | |||
handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, \ | |||
dtype::Int16{}, dtype::Int16{}, name); \ | |||
check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \ | |||
handle(), &rng, epsilon, dtype::Int8{}, \ | |||
dtype::Int8{}, dtype::Int16{}, dtype::Int16{}, \ | |||
name); | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:ARM_COMMON_INT8X8X16"); | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:ARM_COMMON_INT8X8X16"); | |||
#endif | |||
#undef cb | |||
} | |||
#endif | |||
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
@@ -2154,6 +2401,31 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP16) { | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP16_FILTERPREPROCESS) { | |||
using namespace conv_bias; | |||
param::ConvBias cur_param; | |||
std::vector<conv_bias::TestArg> args = | |||
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false); | |||
std::vector<conv_bias::TestArg> args1 = | |||
get_conv_bias_args({1}, 2, false, false, false); | |||
args.insert(args.begin(), args1.begin(), args1.end()); | |||
NormalRNG rng(1); | |||
#define cb(name) \ | |||
check_conv_bias_preprocess(args, handle(), &rng, 0.03, dtype::Float16{}, \ | |||
dtype::Float16{}, dtype::Float16{}, \ | |||
dtype::Float16{}, name); | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_F16_K8X24X1"); | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:AARCH32_F16_K4X16X1"); | |||
#endif | |||
#undef cb | |||
} | |||
#endif | |||
void checker_conv_bias_mul_int8x8x32(std::vector<conv_bias::TestArg> args, | |||
@@ -2185,6 +2457,36 @@ void checker_conv_bias_mul_int8x8x32(std::vector<conv_bias::TestArg> args, | |||
} | |||
} | |||
void checker_conv_bias_int8x8x32_preprocess(std::vector<conv_bias::TestArg> args, | |||
Handle* handle, const char* algo_name) { | |||
using namespace conv_bias; | |||
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
handle); | |||
checker.set_before_exec_callback( | |||
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); | |||
checker.set_dtype(0, dtype::Int8()); | |||
checker.set_dtype(1, dtype::Int8()); | |||
checker.set_dtype(2, dtype::Int32()); | |||
checker.set_dtype(4, dtype::Int32()); | |||
for (auto&& arg : args) { | |||
checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); | |||
} | |||
UniformIntRNG rng{-50, 50}; | |||
for (auto&& arg : args) { | |||
checker.set_dtype(0, dtype::QuantizedS8(2.5f)) | |||
.set_dtype(1, dtype::QuantizedS8(2.5f)) | |||
.set_dtype(2, dtype::QuantizedS32(6.25f)) | |||
.set_dtype(4, {}) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng) | |||
.set_rng(2, &rng) | |||
.set_param(arg.param) | |||
.execs({arg.src, arg.filter, {}, {}, {}}); | |||
} | |||
} | |||
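// Editor note (illustrative): this helper appears to mirror
// checker_conv_bias_mul_int8x8x32 above, but builds the Checker with
// OprWeightPreprocessProxy<ConvBiasForward>, so every exec also exercises the
// weight-preprocess path of the selected IM2COLMATMUL algo.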
#if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
#if !__ARM_FEATURE_DOTPROD | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2) { | |||
@@ -2201,6 +2503,20 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2) { | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2_PREPROCESS) { | |||
using namespace conv_bias; | |||
std::vector<conv_bias::TestArg> args = | |||
get_nchw44_conv_bias_args({2, 5, 7}, 2, false, true, true); | |||
#define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name); | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||
#else | |||
cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1) { | |||
using namespace conv_bias; | |||
std::vector<conv_bias::TestArg> args = | |||
@@ -2216,6 +2532,21 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1) { | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1_PREPROCESS) { | |||
using namespace conv_bias; | |||
std::vector<conv_bias::TestArg> args = | |||
get_nchw44_conv_bias_args({3, 4, 6}, 1, false, true, true); | |||
#define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name); | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||
#else | |||
cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, | |||
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S2) { | |||
UniformIntRNG rng{-50, 50}; | |||
@@ -2235,6 +2566,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, | |||
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S2_PREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_nchw44_conv_bias_args({3, 4, 6}, 2), handle(), &rng, epsilon, \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name); | |||
float epsilon = 0.001; | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||
#else | |||
cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, | |||
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S1) { | |||
UniformIntRNG rng{-50, 50}; | |||
@@ -2252,6 +2602,24 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, | |||
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S1_PREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_nchw44_conv_bias_args({2, 5, 7}, 1), handle(), &rng, epsilon, \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name); | |||
float epsilon = 0.001; | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||
#else | |||
cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); | |||
#endif | |||
#undef cb | |||
} | |||
#if MEGDNN_AARCH64 | |||
TEST_F(ARM_COMMON_MULTI_THREADS, | |||
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_FUSE) { | |||
@@ -2266,6 +2634,21 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, | |||
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_FUSE_PREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_nchw44_conv_bias_args({3}, 1), handle(), &rng, epsilon, \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name); | |||
float epsilon = 0.001; | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||
#undef cb | |||
} | |||
#endif | |||
#endif | |||
#endif | |||
@@ -2287,6 +2670,23 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, | |||
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44DOT_FUSE_PREPROCESS) { | |||
UniformIntRNG rng{-50, 50}; | |||
#define cb(name) \ | |||
check_conv_bias_preprocess( \ | |||
get_nchw44_conv_bias_args({3}, 1, false, false, false, false, \ | |||
true, false, false, false), \ | |||
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||
dtype::QuantizedS8(60.25f), name); | |||
float epsilon = 0.001; | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | |||
#undef cb | |||
} | |||
#endif | |||
#endif | |||
@@ -2320,6 +2720,36 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) { | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) { | |||
using namespace conv_bias; | |||
std::vector<conv_bias::TestArg> args = | |||
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true); | |||
std::vector<conv_bias::TestArg> args1 = | |||
get_conv_bias_args({1}, 2, false, true, true); | |||
args.insert(args.begin(), args1.begin(), args1.end()); | |||
#define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name); | |||
#if MEGDNN_AARCH64 | |||
#if __ARM_FEATURE_DOTPROD | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X12X4_DOTPROD"); | |||
#else | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X8X8"); | |||
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16"); | |||
#endif | |||
#elif MEGDNN_ARMV7 | |||
#if __ARM_FEATURE_DOTPROD | |||
cb("IM2COLMATMUL:AARCH32_INT8_K6X8X4"); | |||
#endif | |||
cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8"); | |||
#endif | |||
#if MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X2X16"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) { | |||
using namespace conv_bias; | |||
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | |||
@@ -2331,25 +2761,62 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) { | |||
#endif | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32_PREPROCESS) { | |||
using namespace conv_bias; | |||
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | |||
{2, 4, 7}, 1, false, false, false, false, false, true, true); | |||
#define cb(name) \ | |||
check_conv_bias_preprocess(args, handle(), nullptr, 0.001, \ | |||
dtype::Float32(), dtype::Float32(), \ | |||
dtype::Float32(), dtype::Float32(), name); | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32) { | |||
using namespace conv_bias; | |||
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | |||
{3, 5, 6}, 2, false, false, false, false, false, true, true); | |||
#define cb(name) check_conv_bias(args, handle(), name); | |||
#if MEGDNN_AARCH64 | |||
check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||
cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||
#elif MEGDNN_ARMV7 | |||
check_conv_bias(args, handle(), "IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||
cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE_PREPROCESS) { | |||
using namespace conv_bias; | |||
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | |||
{3}, 2, false, false, false, false, false, true, true, false); | |||
#define cb(name) \ | |||
check_conv_bias_preprocess(args, handle(), nullptr, 0.001, \ | |||
dtype::Float32(), dtype::Float32(), \ | |||
dtype::Float32(), dtype::Float32(), name); | |||
#if MEGDNN_AARCH64 | |||
cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||
#elif MEGDNN_ARMV7 | |||
cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||
#endif | |||
#undef cb | |||
} | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE) { | |||
using namespace conv_bias; | |||
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | |||
{3}, 2, false, false, false, false, false, true, true, false); | |||
#define cb(name) check_conv_bias(args, handle(), name); | |||
#if MEGDNN_AARCH64 | |||
check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||
cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||
#elif MEGDNN_ARMV7 | |||
check_conv_bias(args, handle(), "IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||
cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||
#endif | |||
#undef cb | |||
} | |||
/***************************** Conv1x1 Algo Test ***********************/ | |||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_F32) { | |||
@@ -1118,6 +1118,30 @@ void checker_conv_bias_int8x8x16(std::vector<conv_bias::TestArg> args, | |||
} | |||
} | |||
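//! check_conv_bias_preprocess mirrors check_conv_bias, but builds the
//! checker with OprWeightPreprocessProxy so the filter weight
//! pre-processing path is exercised before execution; the preprocessed run
//! is then validated against the reference just like the ordinary path.
//! Typical use (illustrative only):
//!     UniformIntRNG rng{-50, 50};
//!     check_conv_bias_preprocess(get_nchw44_conv_bias_args({3}, 1),
//!                                handle(), &rng, 0.001,
//!                                dtype::QuantizedS8(2.5f),
//!                                dtype::QuantizedS8(2.5f),
//!                                dtype::QuantizedS32(6.25f),
//!                                dtype::QuantizedS8(60.25f), algo_name);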
void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args, | |||
Handle* handle, RNG* rng, float epsilon, | |||
DType type0, DType type1, DType type2, | |||
DType type3, const char* algo_name) { | |||
using namespace conv_bias; | |||
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
handle); | |||
checker.set_dtype(0, type0); | |||
checker.set_dtype(1, type1); | |||
checker.set_dtype(2, type2); | |||
checker.set_dtype(4, type3); | |||
checker.set_epsilon(epsilon); | |||
if (nullptr != rng) { | |||
checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng); | |||
} | |||
checker.set_before_exec_callback( | |||
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); | |||
for (auto&& arg : args) { | |||
checker.set_param(arg.param).execs( | |||
{arg.src, arg.filter, arg.bias, {}, {}}); | |||
} | |||
} | |||
void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | |||
param::ConvBias param, Handle* handle, | |||
@@ -58,7 +58,10 @@ std::vector<TestArg> get_int8_chwn4_tensorcore_args(size_t kernel_size); | |||
std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size, | |||
bool compute_float32 = false, | |||
bool group_mode = false); | |||
void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args, | |||
Handle* handle, RNG* rng, float epsilon, | |||
DType type0, DType type1, DType type2, | |||
DType type3, const char* algo_name); | |||
template <typename Opr> | |||
using ConvBiasAlgoChecker = AlgoChecker<Opr>; | |||
@@ -752,7 +752,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2) { | |||
} | |||
} | |||
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) { | |||
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32) { | |||
using namespace conv_bias; | |||
std::vector<TestArg> args; | |||
@@ -842,6 +842,98 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) { | |||
#undef cb2 | |||
} | |||
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) { | |||
using namespace conv_bias; | |||
std::vector<TestArg> args; | |||
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, | |||
size_t p, NonlineMode nonline_mode) { | |||
if (w + 2 * p < kernel || h + 2 * p < kernel) | |||
return; | |||
param::ConvBias param; | |||
param.stride_h = 1; | |||
param.stride_w = 1; | |||
param.pad_h = p; | |||
param.pad_w = p; | |||
param.nonlineMode = nonline_mode; | |||
//! no bias | |||
args.emplace_back(param, TensorShape{1, ic, h, w}, | |||
TensorShape{oc, ic, kernel, kernel}, TensorShape{}); | |||
}; | |||
for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||
for (size_t ic : {1, 4, 8, 16}) | |||
for (size_t oc : {1, 4, 8}) | |||
for (size_t p : {0, 2}) | |||
for (size_t size : {20, 21, 24}) | |||
for (NonlineMode nonline_mode : | |||
{NonlineMode::IDENTITY}) { | |||
run(oc, ic, size, size, kernel, p, nonline_mode); | |||
} | |||
//! test OC block | |||
run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY); | |||
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
handle()); | |||
UniformIntRNG rng{-50, 50}; | |||
#define cb(algo_name) \ | |||
checker.set_before_exec_callback( \ | |||
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||
checker.set_dtype(0, dtype::Int8()); \ | |||
checker.set_dtype(1, dtype::Int8()); \ | |||
checker.set_dtype(2, dtype::Int32()); \ | |||
checker.set_dtype(4, dtype::Int32()); \ | |||
for (auto&& arg : args) { \ | |||
checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \ | |||
} \ | |||
for (auto&& arg : args) { \ | |||
checker.set_dtype(0, dtype::QuantizedS8(2.5f)) \ | |||
.set_dtype(1, dtype::QuantizedS8(2.5f)) \ | |||
.set_dtype(2, dtype::QuantizedS32(6.25f)) \ | |||
.set_dtype(4, {}) \ | |||
.set_rng(0, &rng) \ | |||
.set_rng(1, &rng) \ | |||
.set_rng(2, &rng) \ | |||
.set_param(arg.param) \ | |||
.execs({arg.src, arg.filter, {}, {}, {}}); \ | |||
} | |||
#define cb2(algo_name) \ | |||
checker.set_before_exec_callback( \ | |||
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||
checker.set_dtype(0, dtype::Int8()); \ | |||
checker.set_dtype(1, dtype::Int8()); \ | |||
checker.set_dtype(2, dtype::Int16()); \ | |||
checker.set_dtype(4, dtype::Int16()); \ | |||
for (auto&& arg : args) { \ | |||
checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \ | |||
} | |||
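//! cb covers the int8x8x32 kernels (run once with plain Int8/Int32 dtypes
//! and once quantized), while cb2 covers the int8x8x16 kernels, whose bias
//! and output accumulate into Int16.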
#if MEGDNN_X86_WITH_MKL_DNN | |||
if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) { | |||
cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN"); | |||
} | |||
#endif | |||
#if MEGDNN_X86_WITH_VNNI | |||
if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) { | |||
cb("IM2COLMATMUL:X86_INT8X8X32_VNNI"); | |||
} | |||
#endif | |||
if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) { | |||
cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16"); | |||
cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2"); | |||
cb2("IM2COLMATMUL:X86_INT8X8X16_AVX2"); | |||
} | |||
if (::megdnn::x86::is_supported(::megdnn::x86::SIMDType::SSE4_2)) { | |||
cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2"); | |||
cb2("IM2COLMATMUL:X86_INT8X8X16_SSE"); | |||
} | |||
#undef cb | |||
#undef cb2 | |||
} | |||
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) { | |||
using namespace conv_bias; | |||
std::vector<TestArg> args; | |||
@@ -950,6 +1042,61 @@ TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32) { | |||
#undef cb | |||
} | |||
TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) { | |||
using namespace conv_bias; | |||
std::vector<TestArg> args; | |||
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, | |||
size_t p, NonlineMode nonline_mode) { | |||
if (w + 2 * p < kernel || h + 2 * p < kernel) | |||
return; | |||
param::ConvBias param; | |||
param.stride_h = 1; | |||
param.stride_w = 1; | |||
param.pad_h = p; | |||
param.pad_w = p; | |||
param.nonlineMode = nonline_mode; | |||
//! no bias | |||
args.emplace_back(param, TensorShape{1, ic, h, w}, | |||
TensorShape{oc, ic, kernel, kernel}, TensorShape{}); | |||
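//! bias channel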
args.emplace_back(param, TensorShape{1, ic, h, w}, | |||
TensorShape{oc, ic, kernel, kernel}, | |||
TensorShape{1, oc, 1, 1}); | |||
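//! bias with the full output shape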
args.emplace_back( | |||
param, TensorShape{1, ic, h, w}, | |||
TensorShape{oc, ic, kernel, kernel}, | |||
TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1, | |||
(w + 2 * p - kernel) / param.stride_w + 1}); | |||
}; | |||
for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||
for (size_t ic : {1, 4, 8, 16}) | |||
for (size_t oc : {1, 4, 8, 16, 300}) | |||
for (size_t p : {0, 2}) | |||
for (size_t size : {8, 24}) | |||
for (NonlineMode nonline_mode : | |||
{NonlineMode::IDENTITY, NonlineMode::RELU}) { | |||
run(oc, ic, size, size, kernel, p, nonline_mode); | |||
} | |||
run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY); | |||
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
handle()); | |||
#define cb(algo_name) \ | |||
checker.set_before_exec_callback( \ | |||
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||
for (auto&& arg : args) { \ | |||
checker.set_param(arg.param).execs( \ | |||
{arg.src, arg.filter, arg.bias, {}, {}}); \ | |||
} | |||
cb("IM2COLMATMUL:X86_F32_BLAS"); | |||
#undef cb | |||
} | |||
#endif | |||
@@ -1020,6 +1167,73 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) { | |||
#undef cb | |||
} | |||
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA_FILTER_PREPROCESS) { | |||
using namespace conv_bias; | |||
std::vector<TestArg> args; | |||
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, | |||
size_t p, NonlineMode nonline_mode) { | |||
if (w + 2 * p < kernel || h + 2 * p < kernel) | |||
return; | |||
param::ConvBias param; | |||
param.stride_h = 1; | |||
param.stride_w = 1; | |||
param.pad_h = p; | |||
param.pad_w = p; | |||
param.nonlineMode = nonline_mode; | |||
//! no bias | |||
args.emplace_back(param, TensorShape{1, ic, h, w}, | |||
TensorShape{oc, ic, kernel, kernel}, TensorShape{}); | |||
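//! bias channel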
args.emplace_back(param, TensorShape{1, ic, h, w}, | |||
TensorShape{oc, ic, kernel, kernel}, | |||
TensorShape{1, oc, 1, 1}); | |||
args.emplace_back( | |||
param, TensorShape{1, ic, h, w}, | |||
TensorShape{oc, ic, kernel, kernel}, | |||
TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1, | |||
(w + 2 * p - kernel) / param.stride_w + 1}); | |||
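//! group convolution variants of the same cases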
param.sparse = param::ConvBias::Sparse::GROUP; | |||
args.emplace_back(param, TensorShape{1, 2 * ic, h, w}, | |||
TensorShape{2, oc, ic, kernel, kernel}, | |||
TensorShape{}); | |||
args.emplace_back(param, TensorShape{1, 2 * ic, h, w}, | |||
TensorShape{2, oc, ic, kernel, kernel}, | |||
TensorShape{1, oc * 2, 1, 1}); | |||
args.emplace_back( | |||
param, TensorShape{1, 2 * ic, h, w}, | |||
TensorShape{2, oc, ic, kernel, kernel}, | |||
TensorShape{1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1, | |||
(w + 2 * param.pad_w - kernel) / 1 + 1}); | |||
}; | |||
for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||
for (size_t ic : {1, 4, 8, 16}) | |||
for (size_t oc : {1, 4, 8, 16}) | |||
for (size_t p : {0, 1}) | |||
for (size_t size : {8, 24}) | |||
for (NonlineMode nonline_mode : | |||
{NonlineMode::IDENTITY, NonlineMode::RELU}) { | |||
run(oc, ic, size, size, kernel, p, nonline_mode); | |||
} | |||
run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY); | |||
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
handle()); | |||
#define cb(algo_name) \ | |||
checker.set_before_exec_callback( \ | |||
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||
for (auto&& arg : args) { \ | |||
checker.set_param(arg.param).execs( \ | |||
{arg.src, arg.filter, arg.bias, {}, {}}); \ | |||
} | |||
cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192"); | |||
#undef cb | |||
} | |||
/**************************** Conv1x1 PackA *************************/ | |||
namespace { | |||
void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle, | |||
@@ -1169,6 +1383,77 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) { | |||
#undef cb | |||
} | |||
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) { | |||
using namespace conv_bias; | |||
std::vector<TestArg> args; | |||
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, | |||
size_t p, NonlineMode nonline_mode) { | |||
if (w + 2 * p < kernel || h + 2 * p < kernel) | |||
return; | |||
param::ConvBias param; | |||
param.stride_h = 1; | |||
param.stride_w = 1; | |||
param.pad_h = p; | |||
param.pad_w = p; | |||
param.nonlineMode = nonline_mode; | |||
//! no bias | |||
args.emplace_back(param, TensorShape{1, ic, h, w}, | |||
TensorShape{oc, ic, kernel, kernel}, TensorShape{}); | |||
//! bias channel | |||
args.emplace_back(param, TensorShape{2, ic, h, w}, | |||
TensorShape{oc, ic, kernel, kernel}, | |||
TensorShape{1, oc, 1, 1}); | |||
}; | |||
for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||
for (size_t ic : {1, 4, 8, 16}) | |||
for (size_t oc : {1, 4, 8}) | |||
for (size_t p : {0, 2}) | |||
for (size_t size : {20, 21, 24}) | |||
for (NonlineMode nonline_mode : | |||
{NonlineMode::IDENTITY, NonlineMode::RELU, | |||
NonlineMode::H_SWISH}) { | |||
run(oc, ic, size, size, kernel, p, nonline_mode); | |||
} | |||
run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY); | |||
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
handle()); | |||
#define cb(algo_name) \ | |||
checker.set_before_exec_callback( \ | |||
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||
UniformIntRNG rng{-50, 50}; \ | |||
for (auto&& arg : args) { \ | |||
checker.set_dtype(0, dtype::QuantizedS8(2.5f)) \ | |||
.set_dtype(1, dtype::QuantizedS8(2.5f)) \ | |||
.set_dtype(2, dtype::QuantizedS32(6.25f)) \ | |||
.set_dtype(4, dtype::QuantizedS8(60.25f)) \ | |||
.set_rng(0, &rng) \ | |||
.set_rng(1, &rng) \ | |||
.set_rng(2, &rng) \ | |||
.set_param(arg.param) \ | |||
.execs({arg.src, arg.filter, {}, {}, {}}); \ | |||
} | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
if (x86::is_supported(x86::SIMDType::VNNI)) { | |||
cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN"); | |||
} | |||
#endif | |||
#if MEGDNN_X86_WITH_VNNI | |||
if (x86::is_supported(x86::SIMDType::VNNI)) { | |||
cb("IM2COLMATMUL:X86_INT8X8X32_VNNI"); | |||
} | |||
#endif | |||
if (x86::is_supported(x86::SIMDType::AVX2)) { | |||
cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16"); | |||
} | |||
#undef cb | |||
} | |||
TEST_F(X86, CONV_BIAS_MATMUL) { | |||
using namespace conv_bias; | |||
std::vector<TestArg> args; | |||