feat(x86/rvv): opt AlgoF32DirectNCHWNCHW44

and optimize GiMaximumFloat32/GiMinimumFloat32 on x86

GitOrigin-RevId: 825021e867
2 years ago · fa59a7b061
--- a/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp
+++ b/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp
@@ -748,7 +748,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5(
            GI_FLOAT32_FIXLEN_t src_v[2][5];
 #define COMPUTE_5X5_2(i, dst, src, kernel0, kernel1) \
    load_vec<5>(kernel0, filter + i * 5 * 4);        \
    load_vec<6>(src, input + i * IW * 4);            \
    load_vec<5>(src, input + i * IW * 4);            \
    compute_vec<5>(dst[0][0], &src[0], kernel0);     \
    compute_vec<5>(dst[1][0], &src[0], kernel1);
            // line 0
@@ -813,7 +813,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5(
            GI_FLOAT32_FIXLEN_t src_v[2][5];
 #define COMPUTE_5X5_1(i, dst, src, kernel)   \
    load_vec<5>(kernel, filter + i * 5 * 4); \
    load_vec<6>(src, input + i * IW * 4);    \
    load_vec<5>(src, input + i * IW * 4);    \
    compute_vec<5>(dst, &src[0], kernel)
            // line 0
            COMPUTE_5X5_1(0, dst_v, src_v[0], kernel[0]);
@@ -1148,7 +1148,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5(
            GI_FLOAT32_FIXLEN_t src_v[2][5];
 #define COMPUTE_5X5_1(i, dst, src, kernel)   \
    load_vec<5>(kernel, filter + i * 5 * 4); \
    load_vec<6>(src, input + i * IW * 4);    \
    load_vec<5>(src, input + i * IW * 4);    \
    compute_vec<5>(dst, &src[0], kernel)
            // line 0
            COMPUTE_5X5_1(0, dst_v, src_v[0], kernel[0]);
--- a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h
+++ b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h
@@ -37,6 +37,26 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
    static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
 };

 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
 //! On x86 and RVV the GiSimdFmaLane API is slow; use
 //! GiMultiplyAddScalarFloat32 as an alternative.
 #define MLA GiMultiplyAddScalarFloat32
 #define cb(step)                                                                     \
    c[0][step] = GiFloat32Type2FixLenType(MLA(                                       \
            GiFixLenType2GiFloat32Type(c[0][step]),                                  \
            GiFixLenType2GiFloat32Type(weight[0][weight_idx]),                       \
            *(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); \
    c[1][step] = GiFloat32Type2FixLenType(MLA(                                       \
            GiFixLenType2GiFloat32Type(c[1][step]),                                  \
            GiFixLenType2GiFloat32Type(weight[1][weight_idx]),                       \
            *(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));

 #define cb2(step)                                              \
    c[0][step] = GiFloat32Type2FixLenType(MLA(                 \
            GiFixLenType2GiFloat32Type(c[0][step]),            \
            GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
            *(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));
 #else
 #define cb(step)                                                            \
    c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane(                    \
            GiFixLenType2GiFloat32Type(c[0][step]),                         \
@@ -55,6 +75,8 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
            GiFixLenType2GiFloat32Type(weight[0][weight_idx]),              \
            GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \
            (step * stride + src_idx) % 4));
 #undef MLA
 #endif

 #define SHIFT_CAL_HELPER(ow_remain)                                               \
    template <                                                                    \
@@ -151,23 +173,38 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block, stride, ow_
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            //! On x86 and RVV the GiSimdFmaLane API is slow; use
            //! GiMultiplyAddScalarFloat32 as an alternative.
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
            const float* src[src_reg_size];
 #else
            GI_FLOAT32_FIXLEN_t src[src_reg_size];
 #endif
            GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];

 #define KERNEL_CB(step)                                                                \
    load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0); \
    load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(                            \
            weight, weight_ptr + step * ld_weight_fw, ld_weight_oc);                   \
    cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);                         \
    cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);                         \
    cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);                         \
    cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight);                         \
    cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight);                         \
    cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight);                         \
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
 #define SRC_LOAD(step) \
    load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + step * iw, 0)
 #else
 #define SRC_LOAD(step) \
    load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0)
 #endif

 #define KERNEL_CB(step)                                              \
    SRC_LOAD(step);                                                  \
    load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(          \
            weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
    cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<6, 6, c_dim, stride, remain_w>(c, src, weight);

            UNROLL_CALL_RAW(7, KERNEL_CB)
 #undef KERNEL_CB
 #undef SRC_LOAD

            src_ptr += ld_src_ic;
            weight_ptr += ld_weight_ic;
@@ -200,20 +237,33 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block, stride, ow_
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
            const float* src[src_reg_size];
 #else
            GI_FLOAT32_FIXLEN_t src[src_reg_size];
 #endif
            GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];

 #define KERNEL_CB(step)                                                                \
    load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0); \
    load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(                            \
            weight, weight_ptr + step * ld_weight_fw, ld_weight_oc);                   \
    cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);                         \
    cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);                         \
    cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);                         \
    cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight);                         \
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
 #define SRC_LOAD(step) \
    load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + step * iw, 0);
 #else
 #define SRC_LOAD(step) \
    load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0);
 #endif

 #define KERNEL_CB(step)                                              \
    SRC_LOAD(step);                                                  \
    load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(          \
            weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
    cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight);
            UNROLL_CALL_RAW(5, KERNEL_CB)
 #undef KERNEL_CB
 #undef SRC_LOAD

            src_ptr += ld_src_ic;
            weight_ptr += ld_weight_ic;
@@ -246,10 +296,18 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, ow_
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
            const float* src[src_reg_size];
 #else
            GI_FLOAT32_FIXLEN_t src[src_reg_size];
 #endif
            GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
            // row 0
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
            load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr, 0);
 #else
            load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
 #endif
            load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
@@ -257,7 +315,11 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, ow_
            cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);

            // row 1
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
            load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + iw, 0);
 #else
            load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + iw, 0);
 #endif
            load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
                    weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
@@ -265,8 +327,12 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, ow_
            cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);

            // row 2
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
            load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + 2 * iw, 0);
 #else
            load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(
                    src, src_ptr + 2 * iw, 0);
 #endif
            load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
                    weight, weight_ptr + 2 * ld_weight_fw, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
@@ -637,17 +703,29 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block, stride, ow_
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
            const float* src[src_reg_size];
 #else
            GI_FLOAT32_FIXLEN_t src[src_reg_size];
 #endif
            GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
            // row 0
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
            load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr, 0);
 #else
            load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
 #endif
            load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
            cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);

            // row 1
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
            load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + iw, 0);
 #else
            load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + iw, 0);
 #endif
            load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
                    weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
@@ -670,7 +748,7 @@ struct ConvDirectFp32NchwNchw44 {
        constexpr int fh = filter_size;
        constexpr int fw = filter_size;
        constexpr int ic_step = 1;
 #if MEGDNN_ARMV7
 #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) || defined(MEGDNN_ARMV7)
        constexpr int big_oc_step = 4;
 #else
        constexpr int big_oc_step = 8;
--- a/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h
+++ b/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h
@@ -62,6 +62,13 @@ struct LoadHelper {
    static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args);
 };

 //! Primary template of the pointer-recording helper; only declared here, the
 //! usable definitions are the partial specializations generated by the
 //! LOAD_PTR_HELPER macros in this file.
 //! \tparam weight_number number of entries written (one specialization per count)
 //! \tparam base_offset   constant element offset added to \p ptr
 //! \tparam ptr_step      element distance between two consecutive entries
 //! \tparam oc_block      selects the specialization (0, 1 or 2 in this file);
 //!                       load_ptr_helper forwards its c_dim argument here
 //! impl() takes the destination array by reference, the base pointer and an
 //! int offset that only the specialization with oc_block == 2 reads.
 template <
        int weight_number, int base_offset, int ptr_step, int oc_block, typename T,
        typename T2>
 struct LoadPtrHelper {
    static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset);
 };

 #define WEIGHT_CB(step)                   \
    src[step] = GiFloat32Type2FixLenType( \
            Func::impl(ptr + base_offset + step * ptr_step, args...));
@@ -96,6 +103,36 @@ LOAD_HELPER(16);
 #undef LOAD_HELPER
 #undef WEIGHT_CB

 //! Per-step body for the specializations below: stores the address
 //! ptr + base_offset + step * ptr_step into src[step]. Nothing is
 //! dereferenced here; only the pointer value is recorded.
 #define WEIGHT_PTR_CB(step) src[step] = ptr + base_offset + step * ptr_step;

 //! Generates the LoadPtrHelper partial specialization whose entry count is
 //! `step` and whose fourth template argument is 0 (flat src[] array). impl()
 //! applies WEIGHT_PTR_CB through UNROLL_CALL_RAW(step, ...) -- presumably once
 //! per index in [0, step); confirm against the UNROLL_CALL_RAW definition --
 //! and ignores its third (int) argument.
 #define LOAD_PTR_HELPER(step)                                         \
    template <int base_offset, int ptr_step, typename T, typename T2> \
    struct LoadPtrHelper<step, base_offset, ptr_step, 0, T, T2> {     \
        static GI_FORCEINLINE void impl(T& src, T2 ptr, int) {        \
            UNROLL_CALL_RAW(step, WEIGHT_PTR_CB);                     \
        }                                                             \
    }

 //! Specializations for 1..16 entries.
 LOAD_PTR_HELPER(1);
 LOAD_PTR_HELPER(2);
 LOAD_PTR_HELPER(3);
 LOAD_PTR_HELPER(4);
 LOAD_PTR_HELPER(5);
 LOAD_PTR_HELPER(6);
 LOAD_PTR_HELPER(7);
 LOAD_PTR_HELPER(8);
 LOAD_PTR_HELPER(9);
 LOAD_PTR_HELPER(10);
 LOAD_PTR_HELPER(11);
 LOAD_PTR_HELPER(12);
 LOAD_PTR_HELPER(13);
 LOAD_PTR_HELPER(14);
 LOAD_PTR_HELPER(15);
 LOAD_PTR_HELPER(16);

 //! Both macros are section-local and get redefined by the following sections.
 #undef LOAD_PTR_HELPER
 #undef WEIGHT_PTR_CB

 ///////////////////////////c_dim = 1/////////////////////////
 #define WEIGHT_CB(step) \
    src[0][step] =      \
@@ -122,6 +159,29 @@ LOAD_HELPER(9);
 #undef LOAD_HELPER
 #undef WEIGHT_CB

 //! Per-step body for the specializations below: stores the address
 //! ptr + base_offset + step * ptr_step into src[0][step] (two-dimensional
 //! destination, only row 0 is written). Nothing is dereferenced here.
 #define WEIGHT_PTR_CB(step) src[0][step] = ptr + base_offset + step * ptr_step;

 //! Generates the LoadPtrHelper partial specialization whose entry count is
 //! `step` and whose fourth template argument is 1. impl() applies
 //! WEIGHT_PTR_CB through UNROLL_CALL_RAW(step, ...) -- presumably once per
 //! index in [0, step); confirm against the UNROLL_CALL_RAW definition -- and
 //! ignores its third (int) argument.
 #define LOAD_PTR_HELPER(step)                                         \
    template <int base_offset, int ptr_step, typename T, typename T2> \
    struct LoadPtrHelper<step, base_offset, ptr_step, 1, T, T2> {     \
        static GI_FORCEINLINE void impl(T& src, T2 ptr, int) {        \
            UNROLL_CALL_RAW(step, WEIGHT_PTR_CB);                     \
        }                                                             \
    }

 //! Specializations for 1..9 entries.
 LOAD_PTR_HELPER(1);
 LOAD_PTR_HELPER(2);
 LOAD_PTR_HELPER(3);
 LOAD_PTR_HELPER(4);
 LOAD_PTR_HELPER(5);
 LOAD_PTR_HELPER(6);
 LOAD_PTR_HELPER(7);
 LOAD_PTR_HELPER(8);
 LOAD_PTR_HELPER(9);

 //! Both macros are section-local and get redefined by the following section.
 #undef LOAD_PTR_HELPER
 #undef WEIGHT_PTR_CB

 /////////////////////////c_dim = 2///////////////////////////////
 #define WEIGHT_CB(step)                                                                \
    src[0][step] =                                                                     \
@@ -149,6 +209,30 @@ LOAD_HELPER(8);
 #undef LOAD_HELPER
 #undef WEIGHT_CB

 //! Per-step body for the specializations below: records two addresses per
 //! step. src[0][step] receives ptr + base_offset + step * ptr_step and
 //! src[1][step] receives the same address displaced by the runtime
 //! oc_offset. Nothing is dereferenced here.
 #define WEIGHT_PTR_CB(step)                             \
    src[0][step] = ptr + base_offset + step * ptr_step; \
    src[1][step] = ptr + base_offset + step * ptr_step + oc_offset;

 //! Generates the LoadPtrHelper partial specialization whose entry count is
 //! `step` and whose fourth template argument is 2. impl() applies
 //! WEIGHT_PTR_CB through UNROLL_CALL_RAW(step, ...) -- presumably once per
 //! index in [0, step); confirm against the UNROLL_CALL_RAW definition. This
 //! is the only variant that reads the oc_offset argument.
 #define LOAD_PTR_HELPER(step)                                            \
    template <int base_offset, int ptr_step, typename T, typename T2>    \
    struct LoadPtrHelper<step, base_offset, ptr_step, 2, T, T2> {        \
        static GI_FORCEINLINE void impl(T& src, T2 ptr, int oc_offset) { \
            UNROLL_CALL_RAW(step, WEIGHT_PTR_CB);                        \
        }                                                                \
    }

 //! Specializations for 1..8 entries.
 LOAD_PTR_HELPER(1);
 LOAD_PTR_HELPER(2);
 LOAD_PTR_HELPER(3);
 LOAD_PTR_HELPER(4);
 LOAD_PTR_HELPER(5);
 LOAD_PTR_HELPER(6);
 LOAD_PTR_HELPER(7);
 LOAD_PTR_HELPER(8);

 //! Undefine the macros this section actually defined. The previous
 //! `#undef LOAD_HELPER` was a no-op (that macro is already undefined before
 //! this section) and left LOAD_PTR_HELPER visible to every includer.
 #undef LOAD_PTR_HELPER
 #undef WEIGHT_PTR_CB

 template <
        int weight_number, int base_offset, int ptr_step, int c_dim, typename Func,
        typename T, typename T2>
@@ -157,6 +241,14 @@ GI_FORCEINLINE void load_helper(T& weight, T2 ptr, int oc_offset) {
            weight, ptr, oc_offset);
 }

 //! Convenience entry point for LoadPtrHelper: the array and pointer types are
 //! deduced from the arguments, the remaining template arguments are forwarded
 //! unchanged, and the call is handed to the matching specialization's impl().
 template <
        int weight_number, int base_offset, int ptr_step, int c_dim, typename T,
        typename T2>
 GI_FORCEINLINE void load_ptr_helper(T& weight, T2 ptr, int oc_offset) {
    using Dispatch =
            LoadPtrHelper<weight_number, base_offset, ptr_step, c_dim, T, T2>;
    Dispatch::impl(weight, ptr, oc_offset);
 }

 ////////////////////Store_OCX_OW8_Remain/////////////////////////
 template <int c_dim, int ow_remain, typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain {
--- a/dnn/src/fallback/general_intrinsic/gi_float.h
+++ b/dnn/src/fallback/general_intrinsic/gi_float.h
@@ -1110,7 +1110,7 @@ GI_FORCEINLINE
 GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
    return vmaxq_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
 #elif defined(GI_SSE2_INTRINSICS)
    return _mm_max_ps(Vector1, Vector2);
 #elif defined(GI_RVV_INTRINSICS)
    return vfmax_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
@@ -1127,7 +1127,7 @@ GI_FORCEINLINE
 GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
    return vminq_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
 #elif defined(GI_SSE2_INTRINSICS)
    return _mm_min_ps(Vector1, Vector2);
 #elif defined(GI_RVV_INTRINSICS)
    return vfmin_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));