feat(dnn/arm_common): add nchw44 8x8x16 channel wise conv

stride1 2x2 3x3 5x5 stride2 2x2 3x3 5x5 GitOrigin-RevId: 43d76311c2
4 years ago · a773d07678
--- a/dnn/src/arm_common/conv_bias/int8/channel_wise_kernel.h
+++ b/dnn/src/arm_common/conv_bias/int8/channel_wise_kernel.h
@@ -33,7 +33,7 @@ KERN(stride2, 5)

 #undef KERN

 }  // namespace conv_bias
 }  // namespace channel_wise_nchw44
 }  // namespace arm_common
 }  // namespace megdnn

--- a/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp
@@ -10,16 +10,15 @@
 */

 #include "src/arm_common/conv_bias/int8x8x16/algos.h"
 #include "src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44.h"
 #include "src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.h"
 #include "src/arm_common/conv_bias/int8x8x16/conv_direct.h"
 #include "src/arm_common/conv_bias/int8x8x16/conv_stride2.h"

 #include "midout.h"
 #include "src/common/opr_delegate.h"

 MIDOUT_DECL(megdnn_arm_common_conv_bias_int8816_kimpl)

 #include <atomic>
 #include <cstring>
 #include <mutex>

 using namespace megdnn;
 using namespace arm_common;
@@ -550,4 +549,70 @@ ConvBiasImpl::AlgoI8x8x16Stride2Filter2::dispatch_kerns(
    return {{kern, {group, 1_z, 1_z}}};
 }

 /* ===================== int8x8x16 channel_wise_nchw44 stride1 stride2 algo ===================== */
 //! Report whether this algorithm can handle \p param.
 //!
 //! Every condition below must hold: int8 src/filter with int16 dst, NCHW44
 //! format, no full BIAS, IDENTITY non-linearity, a non-flipped 2-D filter
 //! without dilation, equal strides of 1 or 2, a square 2x2/3x3/5x5 filter, and
 //! one input/output channel per group with a group count divisible by 4.
 bool ConvBiasImpl::AlgoS8x8x16ChanWiseStride1Stride2NCHW44::usable(
        const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
    const auto& meta = param.filter_meta;
    const auto filter_h = meta.spatial[0];

    //! src and filter are int8, dst is int16
    const bool dtype_ok = param.src_type.enumv() == DTypeEnum::Int8 &&
                          param.filter_type.enumv() == DTypeEnum::Int8 &&
                          param.dst_type.enumv() == DTypeEnum::Int16;
    if (!dtype_ok) {
        return false;
    }

    //! layout, bias and activation restrictions
    if (meta.format != param::Convolution::Format::NCHW44 ||
        param.bias_mode == megdnn::BiasMode::BIAS ||
        param.nonlineMode != megdnn::NonlineMode::IDENTITY) {
        return false;
    }

    //! plain (non-flipped, non-dilated) 2-D convolution only
    if (meta.should_flip || meta.spatial_ndim != 2 || meta.dilation[0] != 1 ||
        meta.dilation[1] != 1) {
        return false;
    }

    //! identical stride in both directions, either 1 or 2
    if (meta.stride[0] != meta.stride[1] ||
        !(meta.stride[0] == 1 || meta.stride[0] == 2)) {
        return false;
    }

    //! square filter of size 2, 3 or 5
    if (filter_h != meta.spatial[1] ||
        !(filter_h == 2 || filter_h == 3 || filter_h == 5)) {
        return false;
    }

    //! channel wise: one channel per group, groups packed by 4
    return meta.icpg == 1 && meta.ocpg == 1 && meta.group % 4 == 0;
 }

 //! Workspace size in bytes for \p param: the total size of the bundle built by
 //! the stride1 or stride2 implementation, or 0 for any other stride.
 size_t ConvBiasImpl::AlgoS8x8x16ChanWiseStride1Stride2NCHW44::get_workspace(
        const NCBKernSizeParam& param) const {
    const size_t stride_h = param.filter_meta.stride[0];
    const size_t stride_w = param.filter_meta.stride[1];
    megdnn_assert(stride_h == stride_w);

    switch (stride_h) {
        case 1: {
            auto bundle =
                    channel_wise_nchw44_8x8x16::stride1::get_bundle(param);
            return bundle.total_size_in_bytes();
        }
        case 2: {
            auto bundle =
                    channel_wise_nchw44_8x8x16::stride2::get_bundle(param);
            return bundle.total_size_in_bytes();
        }
        default:
            return 0;
    }
 }

 //! Build the kernel list for \p param by forwarding to the stride1 or stride2
 //! implementation. Returns an empty list when the strides differ or are
 //! neither 1 nor 2.
 SmallVector<ConvBiasImpl::NCBKern>
 ConvBiasImpl::AlgoS8x8x16ChanWiseStride1Stride2NCHW44::dispatch_kerns(
        const NCBKernSizeParam& param) const {
    const size_t stride_h = param.filter_meta.stride[0];
    const size_t stride_w = param.filter_meta.stride[1];

    //! only square strides are handled
    if (stride_h != stride_w) {
        return {};
    }

    if (stride_h == 1) {
        MIDOUT_BEGIN(
                megdnn_arm_common_conv_bias_int8816_kimpl,
                midout_iv(
                        "AlgoS8x8x16ChanWiseStride1Stride2NCHW44_dispatch_kerns"_hash)) {
            return channel_wise_nchw44_8x8x16::stride1::get_kimpls(param);
        }
        MIDOUT_END();
    } else if (stride_h == 2) {
        MIDOUT_BEGIN(
                megdnn_arm_common_conv_bias_int8816_kimpl,
                midout_iv(
                        "AlgoS8x8x16ChanWiseStride2NCHW44_dispatch_kerns"_hash)) {
            return channel_wise_nchw44_8x8x16::stride2::get_kimpls(param);
        }
        MIDOUT_END();
    }

    //! unsupported stride, or the midout region did not return
    return {};
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/arm_common/conv_bias/int8x8x16/algos.h
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/algos.h
@@ -72,6 +72,18 @@ public:
            const NCBKernSizeParam& param) const override;
 };

 //! Channel wise NCHW44 convolution with int8 src/filter and int16 dst; a
 //! single algorithm object serves both the stride 1 and stride 2 kernels.
 class ConvBiasImpl::AlgoS8x8x16ChanWiseStride1Stride2NCHW44 final
        : public AlgoBase {
 public:
    //! name used to select this algorithm
    const char* name() const override {
        return "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";
    }

    bool is_reproducible() const override { return true; }

    //! whether \p param can be handled by this algorithm
    bool usable(const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;

    //! workspace size in bytes required for \p param
    size_t get_workspace(const NCBKernSizeParam& param) const override;

    //! kernels to run for \p param
    SmallVector<NCBKern> dispatch_kerns(
            const NCBKernSizeParam& param) const override;
 };

 }  // namespace arm_common
 }  // namespace megdnn

--- a/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_kernel.h
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_kernel.h
@@ -0,0 +1,40 @@
 /**
 * \file dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_kernel.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #pragma once

 #include "src/arm_common/conv_bias/opr_impl.h"
 #include "src/fallback/conv_bias/common.h"

 namespace megdnn {
 namespace arm_common {
 namespace channel_wise_nchw44_8x8x16 {

 //! KERN(stride, i) declares the function template
 //!     direct_<stride>_<i>x<i>_int8x8x16<bias_mode>(src, filter, bias, dst,
 //!                                                  IH, IW, OH, OW)
 //! with int8 \p src / \p filter, int16 \p bias and an untyped \p dst pointer.
 //! Only declarations are emitted here; the definitions live elsewhere.
 #define KERN(stride, i)                                                   \
    template <BiasMode bias_mode>                                         \
    void direct_##stride##_##i##x##i##_int8x8x16(                         \
            const int8_t* src, const int8_t* filter, const int16_t* bias, \
            void* dst, const size_t IH, const size_t IW, const size_t OH, \
            const size_t OW);

 //! stride 1 kernels for 2x2, 3x3 and 5x5 filters
 KERN(stride1, 2)
 KERN(stride1, 3)
 KERN(stride1, 5)

 //! stride 2 kernels for 2x2, 3x3 and 5x5 filters
 KERN(stride2, 2)
 KERN(stride2, 3)
 KERN(stride2, 5)

 #undef KERN

 }  // namespace channel_wise_nchw44_8x8x16
 }  // namespace arm_common
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen
--- a/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_kernel_int8x8x16_nchw44.cpp
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_kernel_int8x8x16_nchw44.cpp
--- a/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44.h
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44.h
@@ -0,0 +1,57 @@
 /**
 * \file dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #pragma once

 #include "src/arm_common/conv_bias/opr_impl.h"

 namespace megdnn {
 namespace arm_common {
 //! Declarations for the channel wise NCHW44 convolution helpers.
 //! NOTE(review): this header declares namespace channel_wise_nchw44, while the
 //! int8x8x16 sources use channel_wise_nchw44_8x8x16 - confirm this copy is
 //! needed.
 namespace channel_wise_nchw44 {

 //! short aliases for the fallback ConvBias kernel parameter types
 using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
 using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
 using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;

 //! callable with the same signature as do_conv_kern
 using conv_fun = std::function<void(const WorkspaceBundle& bundle,
                                     const NCBKernParam& kern_param,
                                     const NCBKernIndex& ncb_index)>;

 //! stride 1 entry points
 namespace stride1 {

 bool is_available(const NCBKernSizeParam& param);

 //! workspace layout for \p param
 WorkspaceBundle get_bundle(const NCBKernSizeParam& param);

 //! kernel body, specialised on quantization, filter size, bias mode and
 //! post-process operator
 template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
 void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
                   const NCBKernIndex& ncb_index);

 //! kernels to run for \p param
 SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param);
 }  // namespace stride1

 //! stride 2 entry points, same interface as stride1
 namespace stride2 {
 bool is_available(const NCBKernSizeParam& param);

 WorkspaceBundle get_bundle(const NCBKernSizeParam& param);

 template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
 void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
                   const NCBKernIndex& ncb_index);

 SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param);

 }  // namespace stride2
 }  // namespace channel_wise_nchw44
 }  // namespace arm_common
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen
--- a/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.cpp
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.cpp
@@ -0,0 +1,259 @@
 /**
 * \file dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #include "src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.h"
 #include "src/arm_common/conv_bias/int8x8x16/channel_wise_kernel.h"
 #include "src/common/opr_delegate.h"

 #include "midout.h"
 #include "src/fallback/conv_bias/common.h"

 using namespace megdnn;
 using namespace arm_common;
 using namespace channel_wise_nchw44_8x8x16;

 namespace {
 //! Compute the size of the zero padded source plane the kernels read from.
 //!
 //! \param param  kernel size parameters (filter meta and output size)
 //! \param IH2    [out] padded height, SH * (OH - 1) + FH
 //! \param IW2    [out] padded width, SW * (OW2 - 1) + FW, where OW2 is OW
 //!               rounded up to a multiple of 4
 void get_rectified_size(
        const megdnn::fallback::ConvBiasImpl::NCBKernSizeParam& param,
        size_t& IH2, size_t& IW2) {
    auto&& fm = param.filter_meta;
    //! use the vertical stride for the height and the horizontal stride for
    //! the width instead of the horizontal stride for both
    auto SH = fm.stride[0];
    auto SW = fm.stride[1];
    auto OH = param.osz[0];
    auto OW = param.osz[1];
    auto FH = fm.spatial[0];
    auto FW = fm.spatial[1];

    //! round the output width up to a multiple of 4
    size_t OW2 = (OW + 3) & ~3;
    IH2 = SH * OH + FH - SH;
    IW2 = SW * OW2 + FW - SW;
 }
 }  // namespace

 MIDOUT_DECL(megdnn_arm_common_conv_bias_int8x8x16_nchw44_stride1)
 MIDOUT_DECL(megdnn_arm_common_conv_bias_int8x8x16_nchw44_stride2)


 //! Describe the workspace for the stride 1 kernels: one padded source buffer
 //! per thread.
 WorkspaceBundle stride1::get_bundle(
        const ConvBiasImpl::NCBKernSizeParam& param) {
    constexpr size_t pack_ic_size = 4_z;

    size_t padded_h = 0;
    size_t padded_w = 0;
    get_rectified_size(param, padded_h, padded_w);

    //! The extra 16 bytes keep the kernel from reading past the buffer
    const size_t bytes_per_thread =
            padded_h * padded_w * pack_ic_size * sizeof(int8_t) + 16;

    return {nullptr, SmallVector<size_t>(param.nr_threads, bytes_per_thread)};
 }

 //! compute one output channel
 template <size_t filter, BiasMode bias_mode>
 void stride1::do_conv_kern(const WorkspaceBundle& bundle,
                           const NCBKernParam& kern_param,
                           const NCBKernIndex& ncb_index) {
    size_t PH = kern_param.filter_meta.padding[0];
    size_t PW = kern_param.filter_meta.padding[1];
    size_t OH = kern_param.osz[0];
    size_t OW = kern_param.osz[1];
    size_t IH = kern_param.isz[0];
    size_t IW = kern_param.isz[1];
    size_t IH2, IW2;
    get_rectified_size(kern_param, IH2, IW2);

    constexpr size_t pack_group_size = 4_z;
    constexpr size_t pack_ic_size = 4_z;

    size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0];
    size_t group_id = ncb_index.ndrange_id[1];
    int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id));
    const int8_t* sptr =
            kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size);
    const int8_t* fptr = kern_param.filter<dt_int8>(group_id, pack_group_size);
    void* dst = kern_param.dst<void>(batch_id, group_id, 0, pack_group_size);
    const int16_t* bptr =
            kern_param.bias<dt_int16>(batch_id, group_id, 0, pack_group_size);
    //! copy in case of illegal read src when padding is zero
    std::memset(padding_src, 0, sizeof(int8_t) * IH2 * IW2 * pack_ic_size);
    rep(ih, IH) {
        std::memcpy(padding_src + ((ih + PH) * IW2 + PW) * pack_ic_size,
                    sptr + ih * IW * pack_ic_size,
                    sizeof(int8_t) * IW * pack_ic_size);
    }
    sptr = padding_src;

 #define KERN(_size)                                          \
    direct_stride1_##_size##x##_size##_int8x8x16<bias_mode>( \
            sptr, fptr, bptr, dst, IH2, IW2, OH, OW);
    DISPATCH_FILTER_CHANNEL_WISE(filter, KERN);
 #undef KERN
 }

 //! Build the kernel list for the stride 1 channel wise int8x8x16 convolution.
 //!
 //! The do_conv_kern instantiation is chosen from the filter size, bias mode
 //! and non-linearity in \p param; a single kernel is returned whose ndrange is
 //! {batch, group / 4}.
 SmallVector<ConvBiasImpl::NCBKern> stride1::get_kimpls(
        const NCBKernSizeParam& param) {
    auto fm = param.filter_meta;
    size_t N = param.n;
    //! four groups are handled per kernel invocation
    size_t group = fm.group / 4;
    megdnn_assert(fm.group % 4 == 0,
                  "nchw44 channel wise conv with group is not times of 4");
    WorkspaceBundle wbundle = get_bundle(param);
    conv_fun do_conv_fun = nullptr;

 //! innermost step: bind do_conv_fun inside a midout region
 #define DO_CONV_KERN_FUN(filter, bias_mode)                            \
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_int8x8x16_nchw44_stride1, \
                 midout_iv(#filter #bias_mode##_hash)) {               \
        do_conv_fun = do_conv_kern<filter, bias_mode>;                 \
    }                                                                  \
    MIDOUT_END();

 //! dispatch on the non-linearity; only IDENTITY is accepted
 #define GET_OP_PARAM(i, bias_mode)                                  \
    switch (param.nonlineMode) {                                    \
        case param::ConvBias::NonlineMode::IDENTITY:                \
            DO_CONV_KERN_FUN(i, bias_mode)                          \
            break;                                                  \
        default:                                                    \
            megdnn_assert(0, "only support NonlineMode::IDENTITY"); \
            break;                                                  \
    }

 //! dispatch on the bias mode; NO_BIAS and BROADCAST_CHANNEL_BIAS are accepted
 #define GET_BIAS_MODE_PARAM(i)                                  \
    switch (param.bias_mode) {                                  \
        case BiasMode::NO_BIAS:                                 \
            GET_OP_PARAM(i, BiasMode::NO_BIAS)                  \
            break;                                              \
        case BiasMode::BROADCAST_CHANNEL_BIAS:                  \
            GET_OP_PARAM(i, BiasMode::BROADCAST_CHANNEL_BIAS)   \
            break;                                              \
        default:                                                \
            megdnn_assert(0,                                    \
                          "only support BiasMode::NO_BIAS and " \
                          "BiasMode::BROADCAST_CHANNEL_BIAS");  \
            break;                                              \
    }

 //! outermost dispatch on the filter height (2, 3 or 5)
 #define DISPATCH_CONV_KERN()                                         \
    switch (param.filter_meta.spatial[0]) {                          \
        case 2:                                                      \
            GET_BIAS_MODE_PARAM(2)                                   \
            break;                                                   \
        case 3:                                                      \
            GET_BIAS_MODE_PARAM(3)                                   \
            break;                                                   \
        case 5:                                                      \
            GET_BIAS_MODE_PARAM(5)                                   \
            break;                                                   \
        default:                                                     \
            megdnn_assert(0, "only support filtersize 2x2 3x3 5x5"); \
            break;                                                   \
    }

    DISPATCH_CONV_KERN();
    megdnn_assert(do_conv_fun);

    SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
    //! the lambda owns a copy of the bundle and points it at the workspace
    //! supplied with each call before running the kernel
    auto exec_one_group = [wbundle, do_conv_fun](
                                  const NCBKernParam& kern_param,
                                  const NCBKernIndex& ncb_index) mutable {
        wbundle.set(kern_param.workspace_ptr);
        do_conv_fun(wbundle, kern_param, ncb_index);
    };
    ret_kerns.push_back({exec_one_group, {N, group}});
    return ret_kerns;
 //! only DO_CONV_KERN_FUN is undefined here; GET_OP_PARAM,
 //! GET_BIAS_MODE_PARAM and DISPATCH_CONV_KERN stay defined because
 //! stride2::get_kimpls further down in this file expands them again
 #undef DO_CONV_KERN_FUN
 }

 //! Describe the workspace for the stride 2 kernels: one padded source buffer
 //! per thread.
 WorkspaceBundle stride2::get_bundle(
        const ConvBiasImpl::NCBKernSizeParam& param) {
    constexpr size_t pack_ic_size = 4_z;

    size_t padded_h = 0;
    size_t padded_w = 0;
    get_rectified_size(param, padded_h, padded_w);

    //! The extra 16 bytes keep the kernel from reading past the buffer
    const size_t bytes_per_thread =
            padded_h * padded_w * pack_ic_size * sizeof(int8_t) + 16;

    return {nullptr, SmallVector<size_t>(param.nr_threads, bytes_per_thread)};
 }

 //! Run the stride 2 kernel for the (batch, packed group) pair selected by
 //! \p ncb_index.
 //!
 //! The source plane is copied into this thread's zero filled workspace buffer
 //! at the padding offset, and the kernel matching \p filter then reads from
 //! that buffer only.
 //!
 //! \param bundle      workspace bundle; entry thread_id is the staging buffer
 //! \param kern_param  tensor pointers, sizes and filter meta
 //! \param ncb_index   thread id and ndrange ids {batch, packed group}
 template <size_t filter, BiasMode bias_mode>
 void stride2::do_conv_kern(const WorkspaceBundle& bundle,
                           const NCBKernParam& kern_param,
                           const NCBKernIndex& ncb_index) {
    size_t PH = kern_param.filter_meta.padding[0];
    size_t PW = kern_param.filter_meta.padding[1];
    size_t OH = kern_param.osz[0];
    size_t OW = kern_param.osz[1];
    size_t IH = kern_param.isz[0];
    size_t IW = kern_param.isz[1];
    size_t IH2, IW2;
    get_rectified_size(kern_param, IH2, IW2);

    constexpr size_t pack_group_size = 4_z;
    constexpr size_t pack_ic_size = 4_z;

    size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0];
    size_t group_id = ncb_index.ndrange_id[1];
    int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id));
    const int8_t* sptr =
            kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size);
    const int8_t* fptr = kern_param.filter<dt_int8>(group_id, pack_group_size);
    void* dst = kern_param.dst<void>(batch_id, group_id, 0, pack_group_size);
    const int16_t* bptr =
            kern_param.bias<dt_int16>(batch_id, group_id, 0, pack_group_size);
    //! always stage the source, even with zero padding, so the kernel never
    //! reads outside the tensor
    std::memset(padding_src, 0, sizeof(int8_t) * IH2 * IW2 * pack_ic_size);
    //! The padded plane is sized from the output (stride * (OH - 1) + FH
    //! rows), so with stride 2 it can be smaller than PH + IH rows or PW + IW
    //! columns: e.g. IH = 40, FH = 3, PH = 0 gives OH = 19 and IH2 = 39.
    //! Clamp the copied region to the plane so the trailing row/column that
    //! no output depends on is dropped instead of written past the buffer.
    const size_t copy_rows = PH < IH2 ? (IH < IH2 - PH ? IH : IH2 - PH) : 0;
    const size_t copy_cols = PW < IW2 ? (IW < IW2 - PW ? IW : IW2 - PW) : 0;
    for (size_t ih = 0; ih < copy_rows; ++ih) {
        std::memcpy(padding_src + ((ih + PH) * IW2 + PW) * pack_ic_size,
                    sptr + ih * IW * pack_ic_size,
                    sizeof(int8_t) * copy_cols * pack_ic_size);
    }
    sptr = padding_src;

 #define KERN(_size)                                          \
    direct_stride2_##_size##x##_size##_int8x8x16<bias_mode>( \
            sptr, fptr, bptr, dst, IH2, IW2, OH, OW);
    DISPATCH_FILTER_CHANNEL_WISE(filter, KERN);
 #undef KERN
 }

 //! Build the kernel list for the stride 2 channel wise int8x8x16 convolution.
 //!
 //! The do_conv_kern instantiation is chosen from the filter size, bias mode
 //! and non-linearity in \p param; a single kernel is returned whose ndrange is
 //! {batch, group / 4}.
 SmallVector<ConvBiasImpl::NCBKern> stride2::get_kimpls(
        const NCBKernSizeParam& param) {
    auto fm = param.filter_meta;
    size_t N = param.n;
    //! four groups are handled per kernel invocation
    size_t group = fm.group / 4;
    megdnn_assert(fm.group % 4 == 0,
                  "nchw44 channel wise conv with group is not times of 4");
    WorkspaceBundle wbundle = get_bundle(param);
    conv_fun do_conv_fun = nullptr;

 //! innermost step of DISPATCH_CONV_KERN: bind do_conv_fun inside a midout
 //! region tagged for stride 2
 #define DO_CONV_KERN_FUN(filter, bias_mode)                            \
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_int8x8x16_nchw44_stride2, \
                 midout_iv(#filter #bias_mode##_hash)) {               \
        do_conv_fun = do_conv_kern<filter, bias_mode>;                 \
    }                                                                  \
    MIDOUT_END();

    //! DISPATCH_CONV_KERN and the macros it expands to are defined earlier
    //! in this file, in stride1::get_kimpls
    DISPATCH_CONV_KERN();
    megdnn_assert(do_conv_fun);

    SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
    //! the lambda owns a copy of the bundle and points it at the workspace
    //! supplied with each call before running the kernel
    auto exec_one_group = [wbundle, do_conv_fun](
                                  const NCBKernParam& kern_param,
                                  const NCBKernIndex& ncb_index) mutable {
        wbundle.set(kern_param.workspace_ptr);
        do_conv_fun(wbundle, kern_param, ncb_index);
    };
    ret_kerns.push_back({exec_one_group, {N, group}});
    return ret_kerns;
 //! release every dispatch macro, including the one redefined above
 #undef DISPATCH_CONV_KERN
 #undef GET_BIAS_MODE_PARAM
 #undef GET_OP_PARAM
 #undef DO_CONV_KERN_FUN
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.h
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.h
@@ -0,0 +1,57 @@
 /**
 * \file dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #pragma once

 #include "src/arm_common/conv_bias/opr_impl.h"

 namespace megdnn {
 namespace arm_common {
 //! Entry points of the channel wise NCHW44 convolution with int8 inputs and
 //! int16 output.
 namespace channel_wise_nchw44_8x8x16 {

 //! short aliases for the fallback ConvBias kernel parameter types
 using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
 using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
 using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;

 //! callable with the same signature as do_conv_kern
 using conv_fun = std::function<void(const WorkspaceBundle& bundle,
                                     const NCBKernParam& kern_param,
                                     const NCBKernIndex& ncb_index)>;

 //! stride 1 entry points
 namespace stride1 {

 //! NOTE(review): no definition of is_available is visible in
 //! channel_wise_nchw44_8x8x16.cpp - confirm it is defined or drop it
 bool is_available(const NCBKernSizeParam& param);

 //! workspace layout: one padded source buffer per thread
 WorkspaceBundle get_bundle(const NCBKernSizeParam& param);

 //! kernel body for one (batch, packed group) pair, specialised on filter
 //! size and bias mode
 template <size_t filter, BiasMode bias_mode>
 void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
                   const NCBKernIndex& ncb_index);

 //! kernels to run for \p param
 SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param);
 }  // namespace stride1

 //! stride 2 entry points, same interface as stride1
 namespace stride2 {
 //! NOTE(review): no definition of is_available is visible in
 //! channel_wise_nchw44_8x8x16.cpp - confirm it is defined or drop it
 bool is_available(const NCBKernSizeParam& param);

 WorkspaceBundle get_bundle(const NCBKernSizeParam& param);

 template <size_t filter, BiasMode bias_mode>
 void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
                   const NCBKernIndex& ncb_index);

 SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param);

 }  // namespace stride2
 }  // namespace channel_wise_nchw44_8x8x16
 }  // namespace arm_common
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen
--- a/dnn/src/arm_common/conv_bias/opr_impl.cpp
+++ b/dnn/src/arm_common/conv_bias/opr_impl.cpp
@@ -48,6 +48,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
    AlgoS8DirectStride1 s8_direct_stride1;
    AlgoS8ChanWiseStride1NCHW44 s8_channel_wise_stride1_nchw44;
    AlgoS8ChanWiseStride2NCHW44 s8_channel_wise_stride2_nchw44;
    AlgoS8x8x16ChanWiseStride1Stride2NCHW44 s8x8x16_channel_wise_stride1_stride2_nchw44;

 #if __ARM_FEATURE_DOTPROD
    AlgoDotS8DirectStride1 ds8_direct_stride1;
@@ -95,6 +96,7 @@ public:
        direct_algos.emplace_back(&s8_direct_nchw_nchw44);
        direct_algos.emplace_back(&s8_direct_stride1);

        direct_algos.emplace_back(&s8x8x16_channel_wise_stride1_stride2_nchw44);
        direct_algos.emplace_back(&s8_channel_wise_stride1_nchw44);
        direct_algos.emplace_back(&s8_channel_wise_stride2_nchw44);

--- a/dnn/src/arm_common/conv_bias/opr_impl.h
+++ b/dnn/src/arm_common/conv_bias/opr_impl.h
@@ -54,6 +54,7 @@ private:

    class AlgoS8ChanWiseStride1NCHW44;
    class AlgoS8ChanWiseStride2NCHW44;
    class AlgoS8x8x16ChanWiseStride1Stride2NCHW44;

 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    class AlgoFP16WinogradF23;
--- a/dnn/test/arm_common/conv_bias.cpp
+++ b/dnn/test/arm_common/conv_bias.cpp
@@ -558,6 +558,142 @@ void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name,
    }
 }

 //! Generate NCHW44 channel wise (group) convolution cases for benchmarking.
 //!
 //! \param kernel          filter sizes to generate (square filters)
 //! \param stride          stride applied in both directions
 //! \param no_bias         skip the broadcast channel bias variant
 //! \param no_nonlinemode  only generate IDENTITY (otherwise also RELU and
 //!                        H_SWISH)
 //! \param no_full_bias    skip the full (dst shaped) bias variant
 //! \return one TestArg without bias per case, plus the enabled bias variants
 std::vector<conv_bias::TestArg> get_nchw44_channel_wise_benchmark_args(
        std::vector<size_t> kernel, size_t stride, bool no_bias,
        bool no_nonlinemode, bool no_full_bias) {
    using namespace conv_bias;
    using Param = param::ConvBias;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<TestArg> args;

    //! append every enabled bias variant of one convolution configuration
    auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel,
                    size_t stride, NLMode nlmode, bool pad) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        if (pad) {
            param.pad_h = kernel / 2;
            param.pad_w = kernel / 2;
        } else {
            param.pad_h = 0;
            param.pad_w = 0;
        }
        param.nonlineMode = nlmode;
        param.format = param::ConvBias::Format::NCHW44;
        param.sparse = param::ConvBias::Sparse::GROUP;

        args.emplace_back(param, TensorShape{n, group, h, w, 4},
                          TensorShape{group, 1, 1, kernel, kernel, 4},
                          TensorShape{});
        if (!no_bias) {
            args.emplace_back(param, TensorShape{n, group, h, w, 4},
                              TensorShape{group, 1, 1, kernel, kernel, 4},
                              TensorShape{1, group, 1, 1, 4});
        }
        if (!no_full_bias) {
            //! the output height depends on the vertical padding and the
            //! output width on the horizontal padding
            args.emplace_back(
                    param, TensorShape{n, group, h, w, 4},
                    TensorShape{group, 1, 1, kernel, kernel, 4},
                    TensorShape{n, group,
                                (h + 2 * param.pad_h - kernel) / stride + 1,
                                (w + 2 * param.pad_w - kernel) / stride + 1,
                                4});
        }
    };

    std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
    if (!no_nonlinemode) {
        nonlinemode.emplace_back(NLMode::RELU);
        nonlinemode.emplace_back(NLMode::H_SWISH);
    }
    for (size_t n : {1}) {
        for (auto nlmode : nonlinemode) {
            //! padded cases
            for (bool pad : {true}) {
                for (size_t group : {1, 2, 4, 128}) {
                    for (size_t size : {40, 89, 100, 200}) {
                        for (size_t kern : kernel) {
                            pack(n, group, size, size, kern, stride, nlmode,
                                 pad);
                        }
                    }
                }
            }
            //! unpadded cases
            for (bool pad : {false}) {
                for (size_t group : {1, 2, 4, 8, 16, 32, 64, 128}) {
                    for (size_t size : {40, 89, 100}) {
                        for (size_t kern : kernel) {
                            pack(n, group, size, size, kern, stride, nlmode,
                                 pad);
                        }
                    }
                }
            }
        }
    }
    return args;
 }

 //! Benchmark an int8x8x32 channel wise algorithm against an int8x8x16 one on
 //! the same NCHW44 group convolution cases and print time, throughput and
 //! speedup for each case.
 //!
 //! \param algo_name0  algorithm run with int32 bias/dst
 //! \param algo_name1  algorithm run with int16 bias/dst
 //! \param handle      handle used to create operators and benchmarkers
 //! \param kernel      currently unused: filter sizes 2, 3 and 5 are always run
 //! \param stride      stride of the generated cases
 //! \param pack_size   currently unused: the dst element count already includes
 //!                    the packed channel dimension
 void BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(const char* algo_name0,
                                     const char* algo_name1, Handle* handle,
                                     size_t kernel,size_t stride = 1, size_t pack_size = 1) {
    //! NOTE(review): confirm whether the cases should be limited to `kernel`
    static_cast<void>(kernel);
    static_cast<void>(pack_size);

    auto args = get_nchw44_channel_wise_benchmark_args({2, 3, 5}, stride,
                                                       false, true, true);

    using namespace conv_bias;
    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark(handle);
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    benchmark.set_dtype(0, dtype::Int8());
    benchmark.set_dtype(1, dtype::Int8());
    benchmark.set_dtype(2, dtype::Int32());
    benchmark.set_dtype(4, dtype::Int32());

    Benchmarker<ConvBias> benchmark_algo1(handle);
    benchmark_algo1.set_display(false);
    benchmark_algo1.set_times(RUN);
    benchmark_algo1.set_dtype(0, dtype::Int8());
    benchmark_algo1.set_dtype(1, dtype::Int8());
    benchmark_algo1.set_dtype(2, dtype::Int16());
    benchmark_algo1.set_dtype(4, dtype::Int16());

    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * ICPG * FH * FW * 2
        //! The group filter is {group, 1, 1, FH, FW, 4}, so the input
        //! channels per group, FH and FW are dimensions 2, 3 and 4.
        float computations = dst_layout.total_nr_elems() * arg.filter[2] *
                             arg.filter[3] * arg.filter[4] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;

        benchmark.set_param(arg.param);
        auto used = algo_benchmark<ConvBias>(benchmark,
                                             {arg.src, arg.filter, {}, {}, {}},
                                             algo_name0) /
                    RUN;

        arg.param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
        arg.param.format = param::ConvBias::Format::NCHW44;
        benchmark_algo1.set_param(arg.param);

        auto used_algo1 =
                algo_benchmark<ConvBias>(
                        benchmark_algo1,
                        {arg.src, arg.filter, {}, {}, {}},
                        algo_name1) /
                RUN;
        printf("%s %s: normal: %f ms %f Gflops 8x8x16: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used, computations / used, used_algo1,
               computations / used_algo1, used / used_algo1);
    }
 }

 #if MEGDNN_AARCH64
 TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) {
    printf("=========================compare "
@@ -579,6 +715,17 @@ TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x16) {
 }
 #endif

 //! stride 1: int8x8x32 channel wise algorithm versus the int8x8x16 one
 TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONV_NCHW44_INT8x8x32_VS_INT8x8x16_STRIDE1) {
    const char* int8x8x32_algo = "S8_CHAN_WISE_STRD1_NCHW44";
    const char* int8x8x16_algo = "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";
    const size_t kernel = 3;
    const size_t stride = 1;
    const size_t pack_size = 4;
    BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(
            int8x8x32_algo, int8x8x16_algo, handle(), kernel, stride,
            pack_size);
 }
 //! stride 2: int8x8x32 channel wise algorithm versus the int8x8x16 one
 TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONV_NCHW44_INT8x8x32_VS_INT8x8x16_STRIDE2) {
    const char* int8x8x32_algo = "S8_CHAN_WISE_STRD2_NCHW44";
    const char* int8x8x16_algo = "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";
    const size_t kernel = 3;
    const size_t stride = 2;
    const size_t pack_size = 4;
    BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(
            int8x8x32_algo, int8x8x16_algo, handle(), kernel, stride,
            pack_size);
 }

 TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
--- a/dnn/test/arm_common/conv_bias_multi_thread.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp
@@ -9,6 +9,7 @@
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
 #include "megdnn/dtype.h"
 #include "test/arm_common/fixture.h"
 #include "test/common/benchmarker.h"
 #include "test/common/conv_bias.h"
@@ -475,6 +476,36 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
            handle(), "S8_CHAN_WISE_STRD2_NCHW44");
 }

 //! Correctness of the int8x8x16 channel wise NCHW44 algorithm with stride 1,
 //! for 2x2, 3x3 and 5x5 filters.
 TEST_F(ARM_COMMON,
       CONV_BIAS_INT8_INT8_INT16_CHANNEL_WISE_DIRECT1_NCHW44) {
    Checker<ConvBias> checker(handle());
    //! fail unless exactly this algorithm is selected
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>("S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44"));
    checker.set_dtype(0, dtype::Int8());
    checker.set_dtype(1, dtype::Int8());
    checker.set_dtype(2, dtype::Int16());
    checker.set_dtype(4, dtype::Int16());
    auto args = get_nchw44_channel_wise_args({2, 3, 5}, 1, false, true, true);
    for (auto&& arg : args) {
        //! forward the generated bias shape so the bias variants are run with
        //! a bias instead of repeating the no-bias case
        checker.set_param(arg.param).execs(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
 }

 //! Correctness of the int8x8x16 channel wise NCHW44 algorithm with stride 2,
 //! for 2x2, 3x3 and 5x5 filters.
 TEST_F(ARM_COMMON_MULTI_THREADS,
       CONV_BIAS_INT8_INT8_INT16_CHANNEL_WISE_DIRECT2_NCHW44) {
    Checker<ConvBias> checker(handle());
    //! fail unless exactly this algorithm is selected
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>("S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44"));
    checker.set_dtype(0, dtype::Int8());
    checker.set_dtype(1, dtype::Int8());
    checker.set_dtype(2, dtype::Int16());
    checker.set_dtype(4, dtype::Int16());
    auto args = get_nchw44_channel_wise_args({2, 3, 5}, 2, false, true, true);
    for (auto&& arg : args) {
        //! forward the generated bias shape so the bias variants are run with
        //! a bias instead of repeating the no-bias case
        checker.set_param(arg.param).execs(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
 }

 /********************************qint8 direct******************************/
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1) {
    checker_conv_bias_qint8x8x8(get_int8_quint8_conv_bias_args(
--- a/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
@@ -1707,6 +1707,77 @@ TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
 }

 //! Multi-thread benchmark of the int8x8x16 channel wise NCHW44 algorithm with
 //! a 3x3 filter, stride 1 and padding 1 over several channel counts and
 //! spatial sizes.
 TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CHANNEL_WISE_INT8_INT8_INT16_STRIDE1) {
    constexpr size_t RUNS = 50;

    param::ConvBias param;
    param.format = param::ConvBias::Format::NCHW44;
    param.sparse = param::ConvBias::Sparse::GROUP;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    param.stride_h = 1;
    param.stride_w = 1;
    param.pad_h = 1;
    param.pad_w = 1;

    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;
    //! record the tensor shapes and the operation count (in units of 1e6) of
    //! one channel wise convolution
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS,
                          size_t P) {
        const size_t group = IC;
        const size_t OC = IC;
        const size_t S = 1;
        const size_t OH = (H + 2 * P - FS) / S + 1;
        const size_t OW = (W + 2 * P - FS) / S + 1;
        TensorShape dst{N, OC, OH, OW, 4};
        SmallVector<TensorShape> shapes{{N, IC, H, W, 4},
                                        {group, 1, 1, FS, FS, 4},
                                        {1, OC, 1, 1, 4},
                                        {},
                                        {N, OC, OH, OW, 4}};
        float computations =
                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                 dst.total_nr_elems()) *
                1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };

    //! same cases and order as listing them one by one: channels outermost,
    //! spatial size innermost
    for (size_t channels : {128, 64, 32}) {
        for (size_t hw : {200, 128, 100, 80, 56, 28, 14}) {
            bench_case(1, channels, hw, hw, 3, 1);
        }
    }

    std::string algo_name = "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";
    printf("Benchmarker S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44 algo\n");
    std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
                                    dtype::Int16(), dtype::Int16()};
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
                   {1, {4}}, data_type);
 }


 TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_IM2COL_NCHW44_INT8x8x32_STRIDE1) {
    constexpr size_t RUNS = 50;