feat(dnn/x86): add avx2 int8 stride1 chanwise multithread conv

GitOrigin-RevId: 8f310c3d13
5 years ago · 90ca85541e
--- a/dnn/src/common/unroll_macro.h
+++ b/dnn/src/common/unroll_macro.h
@@ -40,6 +40,15 @@
    UNROLL_RAW16(cb, v0, ##a)                                               \
    cb(16, ##a) cb(17, ##a) cb(18, ##a) cb(19, ##a) cb(20, ##a) cb(21, ##a) \
            cb(22, ##a) cb(23, ##a)
 // Extends UNROLL_RAW24 by one step: after expanding UNROLL_RAW24 it also emits
 // cb(24, ...). `v0` is only forwarded; `a...` is a GNU named variadic
 // parameter and `##a` drops the preceding comma when no extra args are given.
 #define UNROLL_RAW25(cb, v0, a...) \
    UNROLL_RAW24(cb, v0, ##a)      \
    cb(24, ##a)
 // Extends UNROLL_RAW25 with the invocations cb(25, ...) through cb(48, ...),
 // all emitted back to back with no separator between them.
 #define UNROLL_RAW49(cb, v0, a...)                                          \
    UNROLL_RAW25(cb, v0, ##a)                                               \
    cb(25, ##a) cb(26, ##a) cb(27, ##a) cb(28, ##a) cb(29, ##a) cb(30, ##a) \
    cb(31, ##a) cb(32, ##a) cb(33, ##a) cb(34, ##a) cb(35, ##a) cb(36, ##a) \
    cb(37, ##a) cb(38, ##a) cb(39, ##a) cb(40, ##a) cb(41, ##a) cb(42, ##a) \
    cb(43, ##a) cb(44, ##a) cb(45, ##a) cb(46, ##a) cb(47, ##a) cb(48, ##a)

 // UNROLL_CALL0 pastes `step` onto UNROLL_RAW to select the unroll macro and
 // passes 0 as its `v0` argument. UNROLL_CALL1 simply forwards to UNROLL_CALL0;
 // presumably the extra level exists so that a macro passed as `step` is
 // expanded before the ## paste happens.
 #define UNROLL_CALL0(step, cb, v...) UNROLL_RAW##step(cb, 0, ##v)
 #define UNROLL_CALL1(step, cb, v...) UNROLL_CALL0(step, cb, ##v)
--- a/dnn/src/x86/conv_bias/int8/algos.cpp
+++ b/dnn/src/x86/conv_bias/int8/algos.cpp
@@ -15,6 +15,7 @@
 #include "src/fallback/convolution/img2col_helper.h"
 #include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
 #include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
 #include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
 #include "src/x86/conv_bias/opr_impl.h"
 #include "src/x86/conv_bias/postprocess_helper.h"
 #include "src/x86/handle.h"
@@ -31,6 +32,65 @@ using namespace dnnl;
 using namespace megdnn;
 using namespace x86;

 //! Decide whether the AVX2 int8 stride-1 channel-wise algorithm can run
 //! \p param.
 //!
 //! Accepted configurations:
 //!  - dtypes: QuantizedS8 x QuantizedS8 -> QuantizedS8 / QuantizedS32, or
 //!    Int8 x Int8 -> Int32;
 //!  - NCHW, 2-D, dilation 1, stride 1, one input and one output channel per
 //!    group (depthwise);
 //!  - square filters of size 2, 3, 5 or 7 (the kernels are declared as
 //!    `i x i`, so a non-square filter must be rejected);
 //!  - bias mode NO_BIAS or BROADCAST_CHANNEL_BIAS, the only modes the kernel
 //!    dispatcher handles (any other mode would hit its assertion);
 //!  - a CPU that supports AVX2.
 //!
 //! \return true when every condition above holds.
 bool ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::usable(
        FallbackConvBiasImpl* /*opr*/, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    auto&& fm = param.filter_meta;
    const auto FH = fm.spatial[0];
    const auto FW = fm.spatial[1];

    const auto src_enum = param.src_type.enumv();
    const auto filter_enum = param.filter_type.enumv();
    const auto dst_enum = param.dst_type.enumv();

    const bool qint8_inputs = src_enum == DTypeEnum::QuantizedS8 &&
                              filter_enum == DTypeEnum::QuantizedS8;
    const bool int8_inputs =
            src_enum == DTypeEnum::Int8 && filter_enum == DTypeEnum::Int8;
    const bool dtype_ok =
            (qint8_inputs && (dst_enum == DTypeEnum::QuantizedS8 ||
                              dst_enum == DTypeEnum::QuantizedS32)) ||
            (int8_inputs && dst_enum == DTypeEnum::Int32);

    //! only these two bias modes have a kernel instantiation
    const bool bias_ok = param.bias_mode == BiasMode::NO_BIAS ||
                         param.bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS;

    //! kernels exist for square 2x2, 3x3, 5x5 and 7x7 filters only
    const bool filter_ok =
            FH == FW && (FH == 2 || FH == 3 || FH == 5 || FH == 7);

    const bool available =
            dtype_ok && bias_ok && fm.format == Param::Format::NCHW &&
            fm.spatial_ndim == 2 && fm.dilation[0] == 1 &&
            fm.dilation[1] == 1 && filter_ok && fm.stride[0] == 1 &&
            fm.stride[1] == 1 && (fm.icpg == 1) && (fm.ocpg == 1) &&
            is_supported(SIMDType::AVX2);
    return available;
 }

 //! Describe the per-thread scratch buffers required for \p param.
 //!
 //! Slot 0: padded int8 source plane (zero bytes when no source copy is
 //!         needed). Slot 1: width-padded destination plane (zero bytes when
 //!         no destination copy is needed). Slot 2: int32 accumulation plane,
 //!         present only when the destination dtype is QuantizedS8.
 //! Every non-empty slot is sized for param.nr_threads planes.
 WorkspaceBundle ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_bundle(
        const NCBKernSizeParam& param) {
    size_t IH2, IW2, OH2, OW2;
    avx2_chanwise_stride1::get_rectified_size(param, IH2, IW2, OH2, OW2);

    const size_t thread_count = param.nr_threads;

    const size_t padded_src_bytes =
            avx2_chanwise_stride1::need_src_copy(param)
                    ? IH2 * IW2 * sizeof(int8_t) * thread_count
                    : 0;
    const size_t padded_dst_bytes =
            avx2_chanwise_stride1::need_dst_copy(param)
                    ? OH2 * OW2 * param.dst_type.size() * thread_count
                    : 0;

    if (param.dst_type.enumv() != DTypeEnum::QuantizedS8) {
        return WorkspaceBundle(nullptr, {padded_src_bytes, padded_dst_bytes});
    }

    //! quantized output is produced from an int32 intermediate plane
    const size_t int32_plane_bytes =
            OH2 * OW2 * sizeof(int32_t) * thread_count;
    return WorkspaceBundle(
            nullptr, {padded_src_bytes, padded_dst_bytes, int32_plane_bytes});
 }

 //! Total workspace size in bytes: the sum reported by the bundle that
 //! get_bundle() builds for \p param.
 size_t ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_workspace(
        FallbackConvBiasImpl*, const NCBKernSizeParam& param) const {
    auto layout = get_bundle(param);
    return layout.total_size_in_bytes();
 }

 //! Build the kernel list by handing \p param and its workspace layout to
 //! avx2_chanwise_stride1::get_kimpls().
 SmallVector<fallback::ConvBiasImpl::NCBKern>
 ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_kimpls(
        const NCBKernSizeParam& param) const {
    return avx2_chanwise_stride1::get_kimpls(param, get_bundle(param));
 }

 bool ConvBiasImpl::AlgoDirectAvx2Stride1Int8::usable(
        FallbackConvBiasImpl* /*opr*/, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
--- a/dnn/src/x86/conv_bias/int8/algos.h
+++ b/dnn/src/x86/conv_bias/int8/algos.h
@@ -13,6 +13,29 @@

 namespace megdnn {
 namespace x86 {

 /* ===================== avx2 stride1 chanwise algo ===================== */
 //! Channel-wise int8 convolution algorithm for stride 1, reported under the
 //! name "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1".
 class ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8 final : public AlgoBase {
    //! kernels to run for \p param; dispatch_kerns() returns this list
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
    //! workspace layout required for \p param
    static WorkspaceBundle get_bundle(const NCBKernSizeParam& param);

 public:
    //! this algorithm declares itself reproducible
    bool is_reproducible() const override { return true; }

    const char* name() const override {
        return "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
    }

    //! whether this algorithm can handle \p param
    bool usable(FallbackConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;

    //! workspace size in bytes needed for \p param
    size_t get_workspace(FallbackConvBiasImpl* opr,
                         const NCBKernSizeParam& param) const override;

    //! kernels to execute for \p param
    SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl*,
            const NCBKernSizeParam& param) const override {
        return get_kimpls(param);
    }

    void* type() const override;
 };

 /* ===================== avx2 stride1 direct algo ===================== */
 class ConvBiasImpl::AlgoDirectAvx2Stride1Int8 final : public AlgoBase {
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
--- a/dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.cpp
+++ b/dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.cpp
--- a/dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.h
+++ b/dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.h
@@ -0,0 +1,39 @@
 /**
 * \file src/x86/conv_bias/int8/avx2_chanwise_kern.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
 #pragma once

 #include "src/x86/conv_bias/opr_impl.h"

 namespace megdnn {
 namespace x86 {
 namespace avx2_chanwise_stride1 {

 // Declares the function template
 //   avx2_chanwise_direct_<stride>_<i>x<i>_int8<bias_mode, is_quantized, Op>
 // for one filter size `i`. Only declarations are produced here; the
 // definitions live in another translation unit.
 //
 // Parameters of the declared function:
 //   src, filter, bias : int8 source, int8 filter and int32 bias pointers
 //   temp, dst         : int32 and int8 output pointers. As used by the
 //                       stride-1 caller, the quantized path passes an int32
 //                       scratch buffer as `temp` and the int8 destination as
 //                       `dst`; the non-quantized path passes the int32
 //                       destination as `temp` and nullptr as `dst`.
 //   IH, IW, OH, OW    : input and output extents handed to the kernel
 //   op                : post-processing functor of type Op
 #define KERN(stride, i)                                                   \
    template <BiasMode bias_mode, bool is_quantized, typename Op>         \
    MEGDNN_ATTRIBUTE_TARGET("avx2")                                       \
    void avx2_chanwise_direct_##stride##_##i##x##i##_int8(                \
            const int8_t* src, const int8_t* filter, const int32_t* bias, \
            int32_t* temp, int8_t* dst, const size_t IH, const size_t IW, \
            const size_t OH, const size_t OW, const Op& op);

 // Stride-1 kernels for 2x2, 3x3, 5x5 and 7x7 filters.
 KERN(stride1, 2)
 KERN(stride1, 3)
 KERN(stride1, 5)
 KERN(stride1, 7)

 // KERN is a local helper and must not leak to includers.
 #undef KERN

 }  // namespace avx2_chanwise_stride1
 }  // namespace x86
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen
--- a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
+++ b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
@@ -0,0 +1,251 @@
 /**
 * \file src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
 #include "src/x86/conv_bias/int8/avx2_chanwise_kern.h"
 #include "src/x86/elemwise_op.h"

 namespace megdnn {
 namespace x86 {
 namespace avx2_chanwise_stride1 {

 //! True when the output width (param.osz[1]) is not a multiple of 16, in
 //! which case results are written to a width-padded scratch plane first.
 bool need_dst_copy(const NCBKernSizeParam& param) {
    const auto out_width = param.osz[1];
    return (out_width % 16) != 0;
 }
 //! True when the source plane must be copied into the padded workspace:
 //! either the convolution has non-zero padding, or the destination itself
 //! needs a padded copy (see need_dst_copy()).
 bool need_src_copy(const NCBKernSizeParam& param) {
    const auto& meta = param.filter_meta;
    if (meta.padding[0] != 0 || meta.padding[1] != 0) {
        return true;
    }
    return need_dst_copy(param);
 }
 //! Compute the padded ("rectified") plane extents used by the kernels.
 //!
 //! \param[in]  param kernel size parameters (output size, filter meta)
 //! \param[out] IH2   source rows needed to produce OH2 output rows
 //! \param[out] IW2   source columns needed to produce OW2 output columns
 //! \param[out] OH2   output rows (same as param.osz[0])
 //! \param[out] OW2   output columns rounded up to a multiple of 16
 //!
 //! The height is derived from the vertical stride and the width from the
 //! horizontal stride; the original used the horizontal stride for both,
 //! which only agrees when the two strides are equal (as for stride 1).
 void get_rectified_size(const NCBKernSizeParam& param, size_t& IH2, size_t& IW2,
                         size_t& OH2, size_t& OW2) {
    auto&& fm = param.filter_meta;
    auto SH = fm.stride[0];
    auto SW = fm.stride[1];
    auto OH = param.osz[0];
    auto OW = param.osz[1];
    auto FH = fm.spatial[0];
    auto FW = fm.spatial[1];

    OH2 = OH;
    //! round the output width up to the next multiple of 16
    OW2 = (OW + 15) & ~15;
    //! (out - 1) * stride + filter, written as stride * out + filter - stride
    IH2 = SH * OH + FH - SH;
    IW2 = SW * OW2 + FW - SW;
 }
 //! Copy one source plane into the calling thread's zero-filled padded
 //! buffer (bundle slot 0) when need_src_copy() says a copy is required;
 //! otherwise nothing is written.
 //!
 //! \param bundle     workspace layout; bound to kern_param.workspace_ptr here
 //! \param kern_param runtime kernel parameters (sizes, padding, pointers)
 //! \param ncb_index  ndrange ids {group, batch, channel} and the thread id
 //!                   that selects the per-thread buffer
 void copy_padding_kern(WorkspaceBundle bundle,
                        const ConvBiasImpl::NCBKernParam& kern_param,
                        const ConvBiasImpl::NCBKernIndex& ncb_index) {
    size_t IH2, IW2, OH2, OW2;
    get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
    const bool copy_required = need_src_copy(kern_param);
    bundle.set(kern_param.workspace_ptr);

    const size_t group_id = ncb_index.ndrange_id[0];
    const size_t batch_id = ncb_index.ndrange_id[1];
    const size_t channel_id = ncb_index.ndrange_id[2];
    const int8_t* src_plane =
            kern_param.src<int8_t>(batch_id, group_id, channel_id);
    if (!copy_required) {
        return;
    }

    const size_t in_rows = kern_param.isz[0];
    const size_t in_cols = kern_param.isz[1];
    const size_t pad_rows = kern_param.filter_meta.padding[0];
    const size_t pad_cols = kern_param.filter_meta.padding[1];

    //! each thread owns one IH2 x IW2 plane inside slot 0
    const size_t plane_elems = IH2 * IW2;
    int8_t* padded_plane = static_cast<int8_t*>(bundle.get(0)) +
                           ncb_index.thread_id * plane_elems;

    //! clear the whole plane, then drop the real rows in at the pad offset
    std::memset(padded_plane, 0, sizeof(int8_t) * IH2 * IW2);
    for (size_t row = 0; row < in_rows; ++row) {
        int8_t* dst_row = padded_plane + (row + pad_rows) * IW2 + pad_cols;
        const int8_t* src_row = src_plane + row * in_cols;
        std::memcpy(dst_row, src_row, sizeof(int8_t) * in_cols);
    }
 }
 //! Run the stride-1 channel-wise kernel for one (batch, group) plane.
 //!
 //! \tparam filter       filter size handed to DISPATCH_FILTER
 //! \tparam bias_mode    bias mode forwarded to the kernel template
 //! \tparam is_quantized carried in the signature; the quantized / plain
 //!                      kernel variant is chosen at run time from dst_type
 //! \tparam Op           post-processing functor, built from two float scales
 //!
 //! \param bundle     workspace layout; slot 0 = padded source, slot 1 =
 //!                   width-padded destination, slot 2 = int32 temp plane
 //! \param kern_param runtime kernel parameters
 //! \param ncb_index  ndrange ids {group, batch} and the thread id selecting
 //!                   the per-thread planes in the workspace
 template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op>
 void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
                 const NCBKernIndex& ncb_index) {
    const size_t OH = kern_param.osz[0];
    const size_t OW = kern_param.osz[1];
    size_t IH2, IW2, OH2, OW2;
    get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
    const bool need_src_copy_var = need_src_copy(kern_param);
    const bool need_dst_copy_var = need_dst_copy(kern_param);
    const bool need_post_process =
            kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8;

    //! placeholder scales; replaced by the real ones on the quantized path
    Op op = Op(1.0f, 4.0f);
    if (need_post_process) {
        float scale_bias =
                kern_param.bias_type.param<dtype::QuantizedS32>().scale;
        float scale_dst = kern_param.dst_type.param<dtype::QuantizedS8>().scale;
        op = Op(scale_bias, scale_dst);
    }

    const size_t padded_src_elems = IH2 * IW2;
    const size_t padded_dst_elems = OH2 * OW2;
    const size_t dst_elem_bytes = kern_param.dst_type.size();

    bundle.set(kern_param.workspace_ptr);

    const size_t thread_id = ncb_index.thread_id;
    const size_t group_id = ncb_index.ndrange_id[0],
                 batch_id = ncb_index.ndrange_id[1];

    const int8_t* sptr = kern_param.src<dt_int8>(batch_id, group_id);
    const int8_t* fptr = kern_param.filter<dt_int8>(group_id);
    void* dst = kern_param.dst<void>(batch_id, group_id);
    const int32_t* bptr = kern_param.bias<dt_int32>(batch_id, group_id);
    if (need_src_copy_var) {
        //! read from this thread's padded source plane instead
        sptr = static_cast<int8_t*>(bundle.get(0)) +
               thread_id * padded_src_elems;
    }

    //! the kernel writes either straight to dst or to this thread's
    //! width-padded plane; the offset is in bytes
    void* dptr = dst;
    if (need_dst_copy_var) {
        dptr = static_cast<int8_t*>(bundle.get(1)) +
               thread_id * padded_dst_elems * dst_elem_bytes;
    }
    int32_t* tptr = nullptr;

 #define KERN_NEED_POST_PROCESS(filter)                                         \
    avx2_chanwise_direct_stride1_##filter##x##filter##_int8<bias_mode, true,   \
                                                            Op>(               \
            sptr, fptr, bptr, tptr, static_cast<int8_t*>(dptr), IH2, IW2, OH2, \
            OW2, op)

 #define KERN_NO_POST_PROCESS(filter)                                          \
    avx2_chanwise_direct_stride1_##filter##x##filter##_int8<bias_mode, false, \
                                                            Op>(              \
            sptr, fptr, bptr, static_cast<int32_t*>(dptr), nullptr, IH2, IW2, \
            OH2, OW2, op)

    if (need_post_process) {
        //! offset counted in int32 elements: one OH2 x OW2 plane per thread.
        //! It must not be scaled by the destination element size.
        tptr = static_cast<int32_t*>(bundle.get(2)) +
               thread_id * padded_dst_elems;
        DISPATCH_FILTER(filter, KERN_NEED_POST_PROCESS)
    } else {
        DISPATCH_FILTER(filter, KERN_NO_POST_PROCESS)
    }

 #undef KERN_NEED_POST_PROCESS
 #undef KERN_NO_POST_PROCESS

    if (need_dst_copy_var) {
        //! copy the valid OW columns of every row out of the padded plane
        const size_t row_bytes = OW * dst_elem_bytes;
        const size_t padded_row_bytes = OW2 * dst_elem_bytes;
        int8_t* out_bytes = static_cast<int8_t*>(dst);
        const int8_t* padded_bytes = static_cast<const int8_t*>(dptr);
        for (size_t oh = 0; oh < OH; ++oh) {
            std::memcpy(out_bytes + oh * row_bytes,
                        padded_bytes + oh * padded_row_bytes, row_bytes);
        }
    }
 }
 //! Select the conv_kimpl instantiation matching \p kern_param and wrap it,
 //! together with the source-padding step, into a single kernel.
 //!
 //! Selection order: filter size (2/3/5/7) -> destination dtype (QuantizedS8
 //! is the quantized path; QuantizedS32 and Int32 are not) -> bias mode
 //! (NO_BIAS / BROADCAST_CHANNEL_BIAS) -> nonlinearity (IDENTITY / RELU /
 //! H_SWISH). Any other value triggers megdnn_assert(0).
 //!
 //! \param kern_param size parameters describing the convolution
 //! \param bundle     workspace layout captured by value into the kernel
 //! \return one kernel with ndrange {group, n, 1}
 SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,
                                 WorkspaceBundle bundle) {
    const auto& fm = kern_param.filter_meta;
    size_t group = fm.group;
    size_t n = kern_param.n;

    SmallVector<NCBKern> ncb_kerns;
    conv_fun do_conv_fun = nullptr;

 #define DO_CONV_KERN_FUN(filter, bias_mode, is_quantized, op) \
    do_conv_fun = conv_kimpl<filter, bias_mode, is_quantized, op>;

 #define GET_OP_PARAM(i, bias_mode, is_quantized)                             \
    switch (kern_param.nonlineMode) {                                        \
        case param::ConvBias::NonlineMode::IDENTITY:                         \
            DO_CONV_KERN_FUN(i, bias_mode, is_quantized,                     \
                             TypeCvtOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32 \
                                               MEGDNN_COMMA dt_qint8>)       \
            break;                                                           \
        case param::ConvBias::NonlineMode::RELU:                             \
            DO_CONV_KERN_FUN(i, bias_mode, is_quantized,                     \
                             ReluOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32    \
                                            MEGDNN_COMMA dt_qint8>)          \
            break;                                                           \
        case param::ConvBias::NonlineMode::H_SWISH:                          \
            DO_CONV_KERN_FUN(i, bias_mode, is_quantized,                     \
                             HSwishOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32  \
                                              MEGDNN_COMMA dt_qint8>)        \
            break;                                                           \
        default:                                                             \
            megdnn_assert(0);                                                \
            break;                                                           \
    }

 #define GET_BIAS_MODE_PARAM(i, is_quantized)                                \
    switch (kern_param.bias_mode) {                                         \
        case BiasMode::NO_BIAS:                                             \
            GET_OP_PARAM(i, BiasMode::NO_BIAS, is_quantized)                \
            break;                                                          \
        case BiasMode::BROADCAST_CHANNEL_BIAS:                              \
            GET_OP_PARAM(i, BiasMode::BROADCAST_CHANNEL_BIAS, is_quantized) \
            break;                                                          \
        default:                                                            \
            megdnn_assert(0);                                               \
            break;                                                          \
    }

 #define GET_QUANTIZED(i)                   \
    switch (kern_param.dst_type.enumv()) { \
        case DTypeEnum::QuantizedS8:       \
            GET_BIAS_MODE_PARAM(i, true)   \
            break;                         \
        case DTypeEnum::QuantizedS32:      \
            GET_BIAS_MODE_PARAM(i, false)  \
            break;                         \
        case DTypeEnum::Int32:             \
            GET_BIAS_MODE_PARAM(i, false)  \
            break;                         \
        default:                           \
            megdnn_assert(0);              \
            break;                         \
    }

 #define DISPATCH_CONV_KERN()                     \
    switch (kern_param.filter_meta.spatial[0]) { \
        case 2:                                  \
            GET_QUANTIZED(2)                     \
            break;                               \
        case 3:                                  \
            GET_QUANTIZED(3)                     \
            break;                               \
        case 5:                                  \
            GET_QUANTIZED(5)                     \
            break;                               \
        case 7:                                  \
            GET_QUANTIZED(7)                     \
            break;                               \
        default:                                 \
            megdnn_assert(0);                    \
            break;                               \
    }

    DISPATCH_CONV_KERN();

    //! the dispatch helpers are local to this function; do not leak them
 #undef DISPATCH_CONV_KERN
 #undef GET_QUANTIZED
 #undef GET_BIAS_MODE_PARAM
 #undef GET_OP_PARAM
 #undef DO_CONV_KERN_FUN

    //! per work item: stage the padded source, then run the convolution
    auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param,
                                                const NCBKernIndex& ncb_index) {
        copy_padding_kern(bundle, kern_param, ncb_index);
        do_conv_fun(bundle, kern_param, ncb_index);
    };
    ncb_kerns.push_back({exec_one_group, {group, n, 1_z}});

    return ncb_kerns;
 }

 }  // namespace avx2_chanwise_stride1
 }  // namespace x86
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen
--- a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h
+++ b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h
@@ -0,0 +1,42 @@
 /**
 * \file src/x86/conv_bias/int8/avx2_chanwise_stride1.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
 #pragma once

 #include "src/x86/conv_bias/opr_impl.h"

 namespace megdnn {
 namespace x86 {
 namespace avx2_chanwise_stride1 {
 // Short aliases for the fallback ConvBias kernel types used in this namespace.
 using NCBKern = fallback::ConvBiasImpl::NCBKern;
 using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
 using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
 using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;

 // Type-erased signature of a per-work-item convolution routine; the bundle
 // is passed by value.
 using conv_fun = std::function<void(WorkspaceBundle bundle,
                                     const NCBKernParam& kern_param,
                                     const NCBKernIndex& ncb_index)>;

 // True when the output width (param.osz[1]) is not a multiple of 16.
 bool need_dst_copy(const NCBKernSizeParam& param);

 // True when the convolution has non-zero padding or need_dst_copy() holds.
 bool need_src_copy(const NCBKernSizeParam& param);

 // Writes the padded plane extents: OH2 is the output height, OW2 the output
 // width rounded up to a multiple of 16, and IH2 / IW2 the source extents
 // needed to produce an OH2 x OW2 output.
 void get_rectified_size(const NCBKernSizeParam& param, size_t& IH2, size_t& IW2,
                         size_t& OH2, size_t& OW2);

 // Returns the kernels to run for `param`, using `bundle` as workspace layout.
 SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param,
                                 WorkspaceBundle bundle);

 }  // namespace avx2_chanwise_stride1
 }  // namespace x86
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen
--- a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp
+++ b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp
@@ -10,7 +10,6 @@
 */

 #include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
 #include "src/common/unroll_macro.h"
 #include "src/x86/conv_bias/int8/common_helper.h"
 #include "src/x86/conv_bias/postprocess_helper.h"

--- a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp
+++ b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp
@@ -10,7 +10,6 @@
 */

 #include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
 #include "src/common/unroll_macro.h"
 #include "src/x86/conv_bias/int8/common_helper.h"
 #include "src/x86/conv_bias/postprocess_helper.h"

--- a/dnn/src/x86/conv_bias/int8/common_helper.h
+++ b/dnn/src/x86/conv_bias/int8/common_helper.h
@@ -11,6 +11,7 @@
 #pragma once

 #include <immintrin.h>
 #include "src/common/unroll_macro.h"
 #include "megdnn/arch.h"
 #ifdef WIN32CMAKE
 #include <smmintrin.h>
--- a/dnn/src/x86/conv_bias/opr_impl.cpp
+++ b/dnn/src/x86/conv_bias/opr_impl.cpp
@@ -65,6 +65,10 @@ void* ConvBiasImpl::AlgoAVX2DirectConvStride2::type() const {
    return x86_algo_type;
 }

 // Returns x86_algo_type, the same tag value returned by the type() override
 // defined just above this one in the file.
 void* ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::type() const {
    return x86_algo_type;
 }

 class ConvBiasImpl::AlgoPack : NonCopyableObj {
    AlgoDirect stride1_direct_large_group{true};
    AlgoDirect stride1_direct_small_group{false};
@@ -72,6 +76,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
    AlgoDirectStride2 stride2_direct_small_group{false};
    AlgoDirectAvx2Stride1Int8 avx2_stride1_direct_int8;
    AlgoAVX2DirectConvStride2 avx2_stride2_direct;
    AlgoChanWiseAvx2Stride1Qint8 avx2_stride1_chanwsie_qint8;
    AlgoMatrixMul matmul;
 #if defined(MEGDNN_X86_WITH_MKL_DNN)
    AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8;
@@ -94,6 +99,7 @@ public:
        all_algos.emplace_back(&stride2_direct_small_group);
        all_algos.emplace_back(&avx2_stride1_direct_int8);
        all_algos.emplace_back(&avx2_stride2_direct);
        all_algos.emplace_back(&avx2_stride1_chanwsie_qint8);
        all_algos.emplace_back(&matmul);

        static CpuOprDelegationStorage<> storage;
--- a/dnn/src/x86/conv_bias/opr_impl.h
+++ b/dnn/src/x86/conv_bias/opr_impl.h
@@ -31,6 +31,7 @@ public:
    class AlgoMatrixMul;
    class AlgoDirectAvx2Stride1Int8;
    class AlgoAVX2DirectConvStride2;
    class AlgoChanWiseAvx2Stride1Qint8;
 #if defined(MEGDNN_X86_WITH_MKL_DNN)
    class AlgoMkldnnConv;
    class AlgoMkldnnQint8;
--- a/dnn/src/x86/elemwise_helper/kimpl/typecvt.h
+++ b/dnn/src/x86/elemwise_helper/kimpl/typecvt.h
@@ -258,6 +258,32 @@ struct TypeCvtOp<SIMDType::SSE4_2, dt_qint32, dt_qint8>
 };

 //! qint32 -> qint8 conversion for AVX2: multiply by `scale`, then convert
 //! to int8 through QConverter.
 //!
 //! The vector overloads take 16 int32 lanes (two __m256i) and produce 16
 //! int8 values in one __m128i. The scalar overload rounds and saturates a
 //! single value to [-128, 127].
 template <>
 struct TypeCvtOp<SIMDType::AVX2, dt_qint32, dt_qint8>
        : UnaryOpBase<SIMDType::AVX2, dt_qint32, dt_qint8> {
    using UnaryOpBase::UnaryOpBase;
    constexpr static size_t SIMD_WIDTH = 8;

    //! Convert \p vsrc and store the 16 resulting bytes at \p dst.
    //! An unaligned store is used because nothing here guarantees that
    //! \p dst is 16-byte aligned; an aligned store would fault otherwise.
    MEGDNN_ATTRIBUTE_TARGET("avx2")
    void operator()(const __m256ix2& vsrc, dt_qint8* dst) const {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), operator()(vsrc));
    }

    //! Convert \p vsrc and return the 16 int8 results.
    MEGDNN_ATTRIBUTE_TARGET("avx2")
    __m128i operator()(const __m256ix2& vsrc) const {
        auto cvtps_src0 = _mm256_cvtepi32_ps(vsrc.val[0]);
        auto cvtps_src1 = _mm256_cvtepi32_ps(vsrc.val[1]);
        auto vitem0 = _mm256_mul_ps(cvtps_src0, _mm256_set1_ps(this->scale));
        auto vitem1 = _mm256_mul_ps(cvtps_src1, _mm256_set1_ps(this->scale));
        return QConverter::convert<__m128i, __m256x2>({{vitem0, vitem1}});
    }

    //! Scalar path: scale, round to nearest and saturate to int8.
    void operator()(src_ctype src, dst_ctype* dst) {
        *reinterpret_cast<int8_t*>(dst) = saturate<int8_t, float>(
                std::round(src.as_int32() * scale), -128, 127);
    }
 };

 template <>
 struct TypeCvtOp<SIMDType::SSE4_2, dt_float32, dt_qint8>
        : UnaryOpBase<SIMDType::SSE4_2, dt_float32, dt_qint8> {
    using UnaryOpBase::UnaryOpBase;
--- a/dnn/test/x86/conv_bias.cpp
+++ b/dnn/test/x86/conv_bias.cpp
@@ -40,6 +40,165 @@ TEST_F(X86, CONV_BIAS_FORWARD) {
                .execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
 }

 //! Int8 x Int8 -> Int32 channel-wise stride-1 convolution, checked against
 //! the reference with the AVX2 channel-wise algorithm forced.
 TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
    using namespace conv_bias;
    std::vector<TestArg> args;

    //! queue a no-bias and a channel-bias case for one configuration
    auto add_cases = [&args](size_t ic, size_t w, size_t h, size_t kernel,
                             size_t p, NonlineMode nonline_mode) {
        const bool filter_fits =
                !(w + 2 * p < kernel || h + 2 * p < kernel);
        if (!filter_fits) {
            return;
        }
        param::ConvBias conv_param;
        conv_param.sparse = param::ConvBias::Sparse::GROUP;
        conv_param.nonlineMode = nonline_mode;
        conv_param.stride_h = 1;
        conv_param.stride_w = 1;
        conv_param.pad_h = p;
        conv_param.pad_w = p;

        //! no bias
        args.emplace_back(conv_param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(conv_param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel},
                          TensorShape{1, ic, 1, 1});
    };

    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t pad : {0, 1}) {
            for (size_t ic : {1, 5, 17, 20}) {
                for (size_t h : {7, 16, 38, 40}) {
                    for (size_t w : {16, 25, 40, 55}) {
                        for (NonlineMode mode : {NonlineMode::IDENTITY}) {
                            add_cases(ic, w, h, kernel, pad, mode);
                        }
                    }
                }
            }
        }
    }

    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Int8());
    checker.set_dtype(1, dtype::Int8());
    checker.set_dtype(2, dtype::Int32());
    checker.set_dtype(4, dtype::Int32());
    checker.set_rng(0, &rng);
    checker.set_rng(1, &rng);
    checker.set_rng(2, &rng);
    checker.set_epsilon(1e-3);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
    for (auto&& test_case : args) {
        checker.set_param(test_case.param)
                .exec({test_case.src, test_case.filter, test_case.bias, {}, {}});
    }
 }

 //! QuantizedS8 x QuantizedS8 with a QuantizedS32 bias and a default
 //! (unspecified) output dtype, with the AVX2 channel-wise algorithm forced.
 TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
    using namespace conv_bias;
    std::vector<TestArg> args;

    //! queue a no-bias and a channel-bias case for one configuration
    auto add_cases = [&args](size_t ic, size_t w, size_t h, size_t kernel,
                             size_t p, NonlineMode nonline_mode) {
        const bool filter_fits =
                !(w + 2 * p < kernel || h + 2 * p < kernel);
        if (!filter_fits) {
            return;
        }
        param::ConvBias conv_param;
        conv_param.sparse = param::ConvBias::Sparse::GROUP;
        conv_param.nonlineMode = nonline_mode;
        conv_param.stride_h = 1;
        conv_param.stride_w = 1;
        conv_param.pad_h = p;
        conv_param.pad_w = p;

        //! no bias
        args.emplace_back(conv_param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(conv_param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel},
                          TensorShape{1, ic, 1, 1});
    };

    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t pad : {0, 1}) {
            for (size_t ic : {1, 3, 5, 7, 17}) {
                for (size_t h : {10, 17, 25, 30}) {
                    for (size_t w : {19, 28, 58, 168}) {
                        for (NonlineMode mode : {NonlineMode::IDENTITY}) {
                            add_cases(ic, w, h, kernel, pad, mode);
                        }
                    }
                }
            }
        }
    }

    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::QuantizedS8(2.5f));
    checker.set_dtype(1, dtype::QuantizedS8(2.5f));
    checker.set_dtype(2, dtype::QuantizedS32(6.25f));
    checker.set_dtype(4, {});
    checker.set_rng(0, &rng);
    checker.set_rng(1, &rng);
    checker.set_rng(2, &rng);
    checker.set_epsilon(1e-3);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
    for (auto&& test_case : args) {
        checker.set_param(test_case.param)
                .exec({test_case.src, test_case.filter, test_case.bias, {}, {}});
    }
 }

 //! QuantizedS8 x QuantizedS8 -> QuantizedS8 with IDENTITY, H_SWISH and RELU
 //! nonlinearities, with the AVX2 channel-wise algorithm forced.
 TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
    using namespace conv_bias;
    std::vector<TestArg> args;

    //! queue a no-bias and a channel-bias case for one configuration
    auto add_cases = [&args](size_t ic, size_t w, size_t h, size_t kernel,
                             size_t p, NonlineMode nonline_mode) {
        const bool filter_fits =
                !(w + 2 * p < kernel || h + 2 * p < kernel);
        if (!filter_fits) {
            return;
        }
        param::ConvBias conv_param;
        conv_param.sparse = param::ConvBias::Sparse::GROUP;
        conv_param.nonlineMode = nonline_mode;
        conv_param.stride_h = 1;
        conv_param.stride_w = 1;
        conv_param.pad_h = p;
        conv_param.pad_w = p;

        //! no bias
        args.emplace_back(conv_param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(conv_param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel},
                          TensorShape{1, ic, 1, 1});
    };

    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t pad : {0, 1}) {
            for (size_t ic : {1, 3, 5, 7, 17}) {
                for (size_t h : {10, 15, 17, 30}) {
                    for (size_t w : {19, 28, 58, 168}) {
                        for (NonlineMode mode :
                             {NonlineMode::IDENTITY, NonlineMode::H_SWISH,
                              NonlineMode::RELU}) {
                            add_cases(ic, w, h, kernel, pad, mode);
                        }
                    }
                }
            }
        }
    }

    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::QuantizedS8(2.5f));
    checker.set_dtype(1, dtype::QuantizedS8(2.5f));
    checker.set_dtype(2, dtype::QuantizedS32(6.25f));
    checker.set_dtype(4, dtype::QuantizedS8(60.25f));
    checker.set_rng(0, &rng);
    checker.set_rng(1, &rng);
    checker.set_rng(2, &rng);
    checker.set_epsilon(1e-3);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
    for (auto&& test_case : args) {
        checker.set_param(test_case.param)
                .exec({test_case.src, test_case.filter, test_case.bias, {}, {}});
    }
 }

 TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32) {
    using namespace conv_bias;
    std::vector<TestArg> args;
@@ -1556,6 +1715,67 @@ void benchmark_impl_comp(const param::ConvBias param,
 }

 }  // namespace
 //! Benchmark the AVX2 int8 channel-wise stride-1 algorithm on six input
 //! shapes for each of the filter sizes 7, 5, 3 and 2, under two thread
 //! configurations.
 TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8) {
    constexpr size_t RUNS = 50;

    param::ConvBias param;
    param.sparse = param::ConvBias::Sparse::GROUP;
    param.stride_h = 1;
    param.stride_w = 1;

    std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
                                    dtype::Int32(), dtype::Int32()};

    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;

    //! Register one case together with its operation count (in 1e6 ops).
    //! NOTE(review): every call overwrites param.pad_h / param.pad_w, so the
    //! benchmark_impl calls below only see the padding of the last
    //! registered case -- confirm whether that is intended.
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS) {
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;

        TensorShape dst{N, IC, (H + 2 * param.pad_h - FS) + 1,
                        (W + 2 * param.pad_w - FS) + 1};
        SmallVector<TensorShape> shapes{
                {N, IC, H, W}, {IC, 1, 1, FS, FS}, {}, {}, {}};
        float computations = (FS * FS * dst.total_nr_elems() * 2) * 1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };

    //! {IC, H, W} of the benchmarked inputs; the batch size is always 1
    const size_t inputs[][3] = {{32, 112, 112}, {144, 56, 56}, {192, 28, 28},
                                {384, 28, 28},  {576, 14, 14}, {960, 7, 7}};
    for (size_t filter_size : {7, 5, 3, 2}) {
        for (auto&& in : inputs) {
            bench_case(1, in[0], in[1], in[2], filter_size);
        }
    }

    std::string algo_name = "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
    printf("Benchmark X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1\n");
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
                   {1, {4}}, data_type);
    shapes_and_computation.clear();
 }

 TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;