feat(arm/dnn): add support for nchw_nchw44 filter 2

GitOrigin-RevId: 013242911e
5 years ago · 3de1fa5b34
--- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw_nchw44_algo.cpp
+++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw_nchw44_algo.cpp
@@ -207,7 +207,7 @@ bool ConvBiasImpl::AlgoF32DirectStride2NCHWNCHW44::usable(
                   (fm.format == param::Convolution::Format::NCHW44);
    bool ok_src_dst = fm.icpg < 4 && (oc % 4 == 0 && oc >= 4) && fm.group == 1;
    bool ok_filter = fm.spatial_ndim == 2 && fh == fm.spatial[1] &&
                     (fh == 3 || fh == 5 || fh == 7);
                     (fh == 2 || fh == 3 || fh == 5 || fh == 7);
    bool ok_slide = fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
                    fm.stride[0] == 2 && fm.stride[1] == 2;
    bool ok_conv = !fm.should_flip && param.bias_mode != BiasMode::BIAS;
@@ -267,6 +267,9 @@ ConvBiasImpl::AlgoF32DirectStride2NCHWNCHW44::dispatch_kerns(
 #define DISPATCH_CONV_KERN()                \
    switch (param.filter_meta.spatial[0]) { \
        case 2:                             \
            GET_BIAS_MODE_PARAM(2)          \
            break;                          \
        case 3:                             \
            GET_BIAS_MODE_PARAM(3)          \
            break;                          \
--- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw_nchw44_kern.cpp
+++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw_nchw44_kern.cpp
@@ -207,24 +207,27 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block> {
            float32x4_t src[src_reg_size];
            float32x4_t weight[c_dim][filter_size];
            // row 0
            load_helper<5, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<3, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(src, src_ptr,
                                                                 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            cal_helper<1, 1, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            cal_helper<2, 2, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            // row 1
            load_helper<5, 0, simd_len, 0, Vld1q_f32>(src, src_ptr + iw, 0);
            load_helper<3, 0, oc_step, c_dim, Vld1q_f32>(
            load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
                    src, src_ptr + iw, 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            cal_helper<1, 1, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            cal_helper<2, 2, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            // row 2
            load_helper<5, 0, simd_len, 0, Vld1q_f32>(src, src_ptr + 2 * iw, 0);
            load_helper<3, 0, oc_step, c_dim, Vld1q_f32>(
            load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
                    src, src_ptr + 2 * iw, 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr + 2 * ld_weight_fw, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            cal_helper<1, 1, c_dim, Vfmaq_laneq_f32>(c, src, weight);
@@ -238,6 +241,52 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block> {
    }
 };
 template <BiasMode bias_mode, typename Op, int remain_w, int oc_block>
 struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block> {
    static void impl(const float32_t* src_ptr, const float32_t* weight_ptr,
                     const float32_t* bias_ptr, float32_t* dst_ptr, int ic,
                     int ih, int iw, int ld_dst_oc, const Op& op) {
        constexpr int loop_ic_step = 1;
        constexpr int filter_size = 2;
        constexpr int oc_step = 4;
        constexpr int simd_len = 4;
        constexpr int src_reg_size = 4;
        constexpr int ld_weight_fw = oc_step * filter_size;
        const int ld_weight_oc = oc_step * filter_size * filter_size * ic;
        const int ld_weight_ic = oc_step * filter_size * filter_size;
        const int ld_src_ic = ih * iw;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            float32x4_t src[src_reg_size];
            float32x4_t weight[c_dim][filter_size];
            // row 0
            load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(src, src_ptr,
                                                                 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            cal_helper<1, 1, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            // row 1
            load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
                    src, src_ptr + iw, 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            cal_helper<1, 1, c_dim, Vfmaq_laneq_f32>(c, src, weight);
            src_ptr += ld_src_ic;
            weight_ptr += ld_weight_ic;
        }
        store_ocx_ow8_remain_static<c_dim, remain_w, Op>(c, op, dst_ptr,
                                                         ld_dst_oc);
    }
 };
 }  // namespace
 void conv_bias::pack_weight_fp32_nchw_nchw44(const float32_t* in_ptr,
@@ -383,19 +432,12 @@ static void conv_direct_stride2_fp32_nchw_nchw44(
                ow, op, ph, pw);                                                  \
    }
 CONSTRUCT_FUNC(2);
 CONSTRUCT_FUNC(3);
 CONSTRUCT_FUNC(5);
 CONSTRUCT_FUNC(7);
 #undef CONSTRUCT_FUNC
 template <BiasMode bias_mode, typename Op>
 void conv_bias::conv_direct_stride2_2x2_fp32_nchw_nchw44(
        const float32_t*, const float32_t*, const float32_t*, float32_t*,
        float32_t*, const int, const int, const int, const int, const int,
        const int, const int, const Op&, const int, const int) {
    megdnn_assert(0, "not imple nchw_nchw44 2x2s2 conv");
 }
 #define INSTANTIATION(stride, i, bias, Op)                                   \
    template void conv_bias::                                                \
            conv_direct_##stride##_##i##x##i##_fp32_nchw_nchw44<bias, Op>(   \
--- a/dnn/test/arm_common/conv_bias.cpp
+++ b/dnn/test/arm_common/conv_bias.cpp
@@ -195,6 +195,7 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
    };
    if (is_fp32) {
        run(1, 1, 4, 112, 112, 2, 2, true);
        run(1, 3, 32, 224, 224, 3, 2, true);
        run(1, 3, 64, 224, 224, 7, 2, true);
    } else {
@@ -1806,12 +1807,12 @@ TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_WINOGRAD_VS_IM2COL_INT8) {
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                        {arg.filter, dtype::Float32()},
                        {arg.bias, dtype::Float32()}, {}, dst_layout);
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                            arg.filter[2] * arg.filter[3] * 2.0 /
                            (1024 * 1024 * 1024) * 1e3;
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        benchmark_im2col.set_param(arg.param);
        auto im2col_used =
@@ -1828,11 +1829,11 @@ TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_WINOGRAD_VS_IM2COL_INT8) {
                RUN;
        printf("%s %s: im2col: %f ms %f Gflops winograd: %f ms %f GFlops "
            "speedup: "
            "%f\n",
            arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
            im2col_used, computations / im2col_used, winograd_used,
            computations / winograd_used, im2col_used / winograd_used);
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               im2col_used, computations / im2col_used, winograd_used,
               computations / winograd_used, im2col_used / winograd_used);
    }
 }
--- a/dnn/test/arm_common/conv_bias_multi_thread.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp
@@ -342,9 +342,9 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR2_SMALL_GROUP) {
                    handle(), "F32STRD2_SMALL_GROUP");
 }
 TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_NCHW_NCHW44_F32) {
    check_conv_bias(
            get_nchw44_conv_bias_args({3, 5, 7}, 2, false, false, false, true),
            handle(), "F32_CONV_NCHW_NCHW44");
    check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false,
                                              false, true),
                    handle(), "F32_CONV_NCHW_NCHW44");
 }
 /**********************************F16 direct************************/
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC