From 58ba080d5f7f9763cc74a07aadf8b2e20134901b Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 14 Jul 2022 18:37:02 +0800 Subject: [PATCH] feat(x86/rvv): make gi conv algo adapt to vv and vf model GitOrigin-RevId: f29593be4df167f63029893bd9cf0fb667861622 --- .../f32_direct_nchw44_kern_common_s1.h | 117 +++++- .../f32_direct_nchw44_kern_common_s2.h | 158 ++++++-- .../f32_direct_nchw_nchw44_kern_common.h | 56 +-- .../fallback/conv_bias/gi/fp32/do_conv_stride1.cpp | 421 ++++++++++++--------- .../fallback/conv_bias/gi/fp32/do_conv_stride2.cpp | 275 +++++++++----- .../conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp | 214 ++++++----- dnn/src/fallback/general_intrinsic/gi_float.h | 4 +- 7 files changed, 799 insertions(+), 446 deletions(-) diff --git a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h index ba4f14a5..955f7e67 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h +++ b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h @@ -24,21 +24,27 @@ struct ShiftCalHelper { static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} }; -#define cb2(step, lane, ow_block) \ - c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ - GiFixLenType2GiFloat32Type(c[0][step]), \ - GiFixLenType2GiFloat32Type(weight[0][lane]), \ - GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \ - c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ - GiFixLenType2GiFloat32Type(c[1][step]), \ - GiFixLenType2GiFloat32Type(weight[1][lane]), \ - GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); - -#define cb(step, lane, ow_block) \ - c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ - GiFixLenType2GiFloat32Type(c[0][step]), \ - GiFixLenType2GiFloat32Type(weight[0][lane]), \ - GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); 
+#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) +//! x86 and rvv GiSimdFmaLane API is slowly, as an alternate, use +//! GiMultiplyAddScalarFloat32 +#define MLA(a, b, c, d) \ + GiMultiplyAddScalarFloat32( \ + GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d)) +#else +#define MLA(a, b, c, d) \ + GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \ + GiFixLenType2GiFloat32Type(c), d) +#endif +#define cb2(step, lane, ow_block) \ + c[0][step] = GiFloat32Type2FixLenType( \ + MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \ + c[1][step] = GiFloat32Type2FixLenType( \ + MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane)); + +#define cb(step, lane, ow_block) \ + c[0][step] = GiFloat32Type2FixLenType( \ + MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); #define SHIFT_CAL_HELPER(ow_block, remain_w) \ template < \ @@ -81,6 +87,7 @@ SHIFT_CAL_HELPER(4, 4); #undef SHIFT_CAL_HELPER #undef cb #undef cb2 +#undef MLA template < int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T, @@ -145,14 +152,23 @@ struct KerGiXXs1Nchw44FP32 { for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { - GI_FLOAT32_FIXLEN_t src[ow_block]; GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[ow_block]; + load_ptr_helper(src, src_ptr, 0); +#else + GI_FLOAT32_FIXLEN_t src[ow_block]; load_helper(src, src_ptr, 0); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr + (ow_block)*ic_step; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block)*ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); 
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -188,19 +204,32 @@ struct KerGiXXs1Nchw44FP32 { for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[ow_block]; + load_ptr_helper(src, src_ptr, 0); +#else GI_FLOAT32_FIXLEN_t src[ow_block]; - GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper(src, src_ptr, 0); +#endif + GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr + (ow_block)*ic_step; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block)*ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[1] = src_ptr + (ow_block + 1) * ic_step; +#else src[1] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -235,33 +264,54 @@ struct KerGiXXs1Nchw44FP32 { for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[ow_block]; + load_ptr_helper(src, src_ptr, 0); +#else GI_FLOAT32_FIXLEN_t src[ow_block]; - GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper(src, src_ptr, 0); +#endif + GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] 
= src_ptr + (ow_block)*ic_step; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block)*ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[1] = src_ptr + (ow_block + 1) * ic_step; +#else src[1] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[2] = src_ptr + (ow_block + 2) * ic_step; +#else src[2] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[3] = src_ptr + (ow_block + 3) * ic_step; +#else src[3] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -297,45 +347,74 @@ struct KerGiXXs1Nchw44FP32 { for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[ow_block]; + load_ptr_helper(src, src_ptr, 0); +#else GI_FLOAT32_FIXLEN_t src[ow_block]; - GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper(src, src_ptr, 0); +#endif + GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr + (ow_block)*ic_step; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block)*ic_step)); 
+#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[1] = src_ptr + (ow_block + 1) * ic_step; +#else src[1] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[2] = src_ptr + (ow_block + 2) * ic_step; +#else src[2] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[3] = src_ptr + (ow_block + 3) * ic_step; +#else src[3] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[4] = src_ptr + (ow_block + 4) * ic_step; +#else src[4] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 4) * ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[5] = src_ptr + (ow_block + 5) * ic_step; +#else src[5] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 5) * ic_step)); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h index e33e2538..3a7c0cdf 100644 --- 
a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h +++ b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h @@ -24,21 +24,28 @@ struct ShiftCalHelper { static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} }; -#define cb2(step, lane, ow_block) \ - c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ - GiFixLenType2GiFloat32Type(c[0][step]), \ - GiFixLenType2GiFloat32Type(weight[0][lane]), \ - GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \ - c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ - GiFixLenType2GiFloat32Type(c[1][step]), \ - GiFixLenType2GiFloat32Type(weight[1][lane]), \ - GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); - -#define cb(step, lane, ow_block) \ - c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ - GiFixLenType2GiFloat32Type(c[0][step]), \ - GiFixLenType2GiFloat32Type(weight[0][lane]), \ - GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) +//! x86 and rvv GiSimdFmaLane API is slowly, as an alternate, use +//! 
GiMultiplyAddScalarFloat32 +#define MLA(a, b, c, d) \ + GiMultiplyAddScalarFloat32( \ + GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d)) +#else +#define MLA(a, b, c, d) \ + GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \ + GiFixLenType2GiFloat32Type(c), d) +#endif + +#define cb2(step, lane, ow_block) \ + c[0][step] = GiFloat32Type2FixLenType( \ + MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \ + c[1][step] = GiFloat32Type2FixLenType( \ + MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane)); + +#define cb(step, lane, ow_block) \ + c[0][step] = GiFloat32Type2FixLenType( \ + MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); #define SHIFT_CAL_HELPER(ow_block, remain_w) \ template < \ @@ -81,6 +88,7 @@ SHIFT_CAL_HELPER(4, 4); #undef SHIFT_CAL_HELPER #undef cb #undef cb2 +#undef MLA template < int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T, @@ -146,15 +154,24 @@ struct KerGiXXs2Nchw44FP32 { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; - GI_FLOAT32_FIXLEN_t src[ow_block]; GI_FLOAT32_FIXLEN_t weight[c_dim][4]; - /////////row 0///////////// +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[ow_block]; + load_ptr_helper(src, src_ptr, 0); +#else + GI_FLOAT32_FIXLEN_t src[ow_block]; load_helper(src, src_ptr, 0); +#endif + /////////row 0///////////// load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr_odd, 0); +#else load_helper(src, src_ptr_odd, 0); +#endif load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -162,12 +179,20 @@ 
struct KerGiXXs2Nchw44FP32 { src_ptr_odd += ld_src_iw; weight_ptr += ld_weight_fh; /////////row 1///////////// +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr, 0); +#else load_helper(src, src_ptr, 0); +#endif load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr_odd, 0); +#else load_helper(src, src_ptr_odd, 0); +#endif load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -203,21 +228,34 @@ struct KerGiXXs2Nchw44FP32 { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; - GI_FLOAT32_FIXLEN_t src[ow_block]; GI_FLOAT32_FIXLEN_t weight[c_dim][4]; - /////////row 0///////////// +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[ow_block]; + load_ptr_helper(src, src_ptr, 0); +#else + GI_FLOAT32_FIXLEN_t src[ow_block]; load_helper(src, src_ptr, 0); +#endif + /////////row 0///////////// load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr + ow_block * simd_len; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + ow_block * simd_len)); +#endif load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr_odd, 0); +#else load_helper(src, src_ptr_odd, 0); +#endif load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, 
weight); @@ -225,17 +263,29 @@ struct KerGiXXs2Nchw44FP32 { src_ptr_odd += ld_src_iw; weight_ptr += ld_weight_fh; /////////row 1///////////// +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr, 0); +#else load_helper(src, src_ptr, 0); +#endif load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr + ow_block * simd_len; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + ow_block * simd_len)); +#endif load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr_odd, 0); +#else load_helper(src, src_ptr_odd, 0); +#endif load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -243,18 +293,30 @@ struct KerGiXXs2Nchw44FP32 { src_ptr_odd += ld_src_iw; weight_ptr += ld_weight_fh; //////////row 2///////////// +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr, 0); +#else load_helper(src, src_ptr, 0); +#endif load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr + ow_block * simd_len; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + ow_block * simd_len)); +#endif load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr_odd, 0); +#else load_helper(src, src_ptr_odd, 0); +#endif 
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -292,30 +354,51 @@ struct KerGiXXs2Nchw44FP32 { const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { - GI_FLOAT32_FIXLEN_t src[ow_block]; GI_FLOAT32_FIXLEN_t weight[c_dim][4]; - // even element +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[ow_block]; + load_ptr_helper(src, src_ptr, 0); +#else + GI_FLOAT32_FIXLEN_t src[ow_block]; load_helper(src, src_ptr, 0); +#endif + // even element load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr + ow_block * simd_len; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + ow_block * simd_len)); +#endif load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[1] = src_ptr + (ow_block + 1) * simd_len; +#else src[1] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); +#endif load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); // odd element +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr_odd, 0); +#else load_helper(src, src_ptr_odd, 0); +#endif load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr_odd + ow_block * simd_len; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr_odd + ow_block 
* simd_len)); +#endif load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -360,40 +443,69 @@ struct KerGiXXs2Nchw44FP32 { const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { - GI_FLOAT32_FIXLEN_t src[ow_block]; GI_FLOAT32_FIXLEN_t weight[c_dim][4]; - // even element +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[ow_block]; + load_ptr_helper(src, src_ptr, 0); +#else + GI_FLOAT32_FIXLEN_t src[ow_block]; load_helper(src, src_ptr, 0); +#endif + // even element load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr + ow_block * simd_len; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + ow_block * simd_len)); +#endif load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[1] = src_ptr + (ow_block + 1) * simd_len; +#else src[1] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); +#endif load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[2] = src_ptr + (ow_block + 2) * simd_len; +#else src[2] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr + (ow_block + 2) * simd_len)); +#endif load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); // odd element +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr_odd, 0); 
+#else load_helper(src, src_ptr_odd, 0); +#endif load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[0] = src_ptr_odd + ow_block * simd_len; +#else src[0] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr_odd + ow_block * simd_len)); +#endif load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + src[1] = src_ptr_odd + (ow_block + 1) * simd_len; +#else src[1] = GiFloat32Type2FixLenType( GiLoadFloat32(src_ptr_odd + (ow_block + 1) * simd_len)); +#endif load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h index 43e43260..a8e49d9e 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h +++ b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h @@ -40,44 +40,29 @@ struct ShiftCalHelper { #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) //! x86 and rvv GiSimdFmaLane API is slowly, as an alternate, use //! 
GiMultiplyAddScalarFloat32 -#define MLA GiMultiplyAddScalarFloat32 -#define cb(step) \ - c[0][step] = GiFloat32Type2FixLenType(MLA( \ - GiFixLenType2GiFloat32Type(c[0][step]), \ - GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ - *(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); \ - c[1][step] = GiFloat32Type2FixLenType(MLA( \ - GiFixLenType2GiFloat32Type(c[1][step]), \ - GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \ - *(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); - -#define cb2(step) \ - c[0][step] = GiFloat32Type2FixLenType(MLA( \ - GiFixLenType2GiFloat32Type(c[0][step]), \ - GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ - *(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); +#define MLA(a, b, c, d) \ + GiMultiplyAddScalarFloat32( \ + GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d)) #else -#define cb(step) \ - c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ - GiFixLenType2GiFloat32Type(c[0][step]), \ - GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ - GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ - (step * stride + src_idx) % 4)); \ - c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ - GiFixLenType2GiFloat32Type(c[1][step]), \ - GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \ - GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ - (step * stride + src_idx) % 4)); - -#define cb2(step) \ - c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ - GiFixLenType2GiFloat32Type(c[0][step]), \ - GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ - GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ - (step * stride + src_idx) % 4)); -#undef MLA +#define MLA(a, b, c, d) \ + GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \ + GiFixLenType2GiFloat32Type(c), d) #endif +#define cb(step) \ + c[0][step] = GiFloat32Type2FixLenType( \ + MLA(c[0][step], 
weight[0][weight_idx], src[(step * stride + src_idx) / 4], \ + (step * stride + src_idx) % 4)); \ + c[1][step] = GiFloat32Type2FixLenType( \ + MLA(c[1][step], weight[1][weight_idx], src[(step * stride + src_idx) / 4], \ + (step * stride + src_idx) % 4)); + +#define cb2(step) \ + c[0][step] = GiFloat32Type2FixLenType( \ + MLA(c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \ + (step * stride + src_idx) % 4)); + #define SHIFT_CAL_HELPER(ow_remain) \ template < \ int src_idx, int weight_idx, int stride, typename T, typename T2, \ @@ -108,6 +93,7 @@ SHIFT_CAL_HELPER(8) #undef SHIFT_CAL_HELPER #undef cb #undef cb2 +#undef MLA template < int src_idx, int weight_idx, int c_dim, int stride, int remain_w, typename T, diff --git a/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride1.cpp b/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride1.cpp index e7a865d1..5357e202 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride1.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride1.cpp @@ -17,6 +17,30 @@ using namespace conv_stride1; using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; +#if defined(GI_RVV_INTRINSICS) +#define PREFER_VF +#endif + +#if defined(PREFER_VF) +#define MLA(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) +namespace { +GI_FORCEINLINE void ext_float32_ptr( + const float* a, const float* b, const int n, float* ret) { + int t_count = GI_SIMD_LEN_BYTE / sizeof(float); + int a_count = t_count - n; + for (int i = 0; i < a_count; i++) { + ret[i] = a[i + n]; + } + for (int i = 0; i < n; i++) { + ret[i + a_count] = b[i]; + } +} +}; // namespace + +#else +#define MLA(a, b, c, d) GiSimdFmaLane(a, b, c, d) +#endif + void conv_stride1::do_conv_2x2_stride1( const float* src, const float* filter, float* dst, size_t IH, size_t IW, size_t OH, size_t OW, size_t IC) { @@ -143,10 +167,18 @@ void conv_stride1::do_conv_3x3_stride1( const float* k1 = filter + 3; 
const float* k2 = filter + 5; +#if defined(PREFER_VF) + const float* _k0123 = k0; + const float* _k3456 = k1; + const float* _k5678 = k2; + float _k6789[GI_SIMD_LEN_BYTE / sizeof(float)]; + ext_float32_ptr(_k5678, _k5678, 1, _k6789); +#else GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); GI_FLOAT32_t _k3456 = GiLoadFloat32(k1); GI_FLOAT32_t _k5678 = GiLoadFloat32(k2); GI_FLOAT32_t _k6789 = GiExtqFloat32(_k5678, _k5678, 1); +#endif size_t h = 0; for (; h + 1 < OH; h += 2) { @@ -178,25 +210,25 @@ void conv_stride1::do_conv_3x3_stride1( GI_FLOAT32_t _r31 = GiExtqFloat32(_r30, _r30n, 1); GI_FLOAT32_t _r32 = GiExtqFloat32(_r30, _r30n, 2); - _sum1 = GiSimdFmaLane(_sum1, _r00, _k0123, 0); - _sum2 = GiSimdFmaLane(_sum2, _r01, _k0123, 1); - _sum1 = GiSimdFmaLane(_sum1, _r02, _k0123, 2); - _sum2 = GiSimdFmaLane(_sum2, _r10, _k3456, 0); - _sum1 = GiSimdFmaLane(_sum1, _r11, _k3456, 1); - _sum2 = GiSimdFmaLane(_sum2, _r12, _k3456, 2); - _sum1 = GiSimdFmaLane(_sum1, _r20, _k6789, 0); - _sum2 = GiSimdFmaLane(_sum2, _r21, _k6789, 1); - _sum1 = GiSimdFmaLane(_sum1, _r22, _k6789, 2); - - _sum3 = GiSimdFmaLane(_sum3, _r10, _k0123, 0); - _sum4 = GiSimdFmaLane(_sum4, _r11, _k0123, 1); - _sum3 = GiSimdFmaLane(_sum3, _r12, _k0123, 2); - _sum4 = GiSimdFmaLane(_sum4, _r20, _k3456, 0); - _sum3 = GiSimdFmaLane(_sum3, _r21, _k3456, 1); - _sum4 = GiSimdFmaLane(_sum4, _r22, _k3456, 2); - _sum3 = GiSimdFmaLane(_sum3, _r30, _k6789, 0); - _sum4 = GiSimdFmaLane(_sum4, _r31, _k6789, 1); - _sum3 = GiSimdFmaLane(_sum3, _r32, _k6789, 2); + _sum1 = MLA(_sum1, _r00, _k0123, 0); + _sum2 = MLA(_sum2, _r01, _k0123, 1); + _sum1 = MLA(_sum1, _r02, _k0123, 2); + _sum2 = MLA(_sum2, _r10, _k3456, 0); + _sum1 = MLA(_sum1, _r11, _k3456, 1); + _sum2 = MLA(_sum2, _r12, _k3456, 2); + _sum1 = MLA(_sum1, _r20, _k6789, 0); + _sum2 = MLA(_sum2, _r21, _k6789, 1); + _sum1 = MLA(_sum1, _r22, _k6789, 2); + + _sum3 = MLA(_sum3, _r10, _k0123, 0); + _sum4 = MLA(_sum4, _r11, _k0123, 1); + _sum3 = MLA(_sum3, _r12, _k0123, 2); + _sum4 = 
MLA(_sum4, _r20, _k3456, 0); + _sum3 = MLA(_sum3, _r21, _k3456, 1); + _sum4 = MLA(_sum4, _r22, _k3456, 2); + _sum3 = MLA(_sum3, _r30, _k6789, 0); + _sum4 = MLA(_sum4, _r31, _k6789, 1); + _sum3 = MLA(_sum3, _r32, _k6789, 2); _sum1 = GiAddFloat32(_sum1, _sum2); _sum3 = GiAddFloat32(_sum3, _sum4); @@ -243,15 +275,15 @@ void conv_stride1::do_conv_3x3_stride1( GI_FLOAT32_t _r21 = GiExtqFloat32(_r20, _r20n, 1); GI_FLOAT32_t _r22 = GiExtqFloat32(_r20, _r20n, 2); - _sum1 = GiSimdFmaLane(_sum1, _r00, _k0123, 0); - _sum2 = GiSimdFmaLane(_sum2, _r01, _k0123, 1); - _sum1 = GiSimdFmaLane(_sum1, _r02, _k0123, 2); - _sum2 = GiSimdFmaLane(_sum2, _r10, _k3456, 0); - _sum1 = GiSimdFmaLane(_sum1, _r11, _k3456, 1); - _sum2 = GiSimdFmaLane(_sum2, _r12, _k3456, 2); - _sum1 = GiSimdFmaLane(_sum1, _r20, _k6789, 0); - _sum2 = GiSimdFmaLane(_sum2, _r21, _k6789, 1); - _sum1 = GiSimdFmaLane(_sum1, _r22, _k6789, 2); + _sum1 = MLA(_sum1, _r00, _k0123, 0); + _sum2 = MLA(_sum2, _r01, _k0123, 1); + _sum1 = MLA(_sum1, _r02, _k0123, 2); + _sum2 = MLA(_sum2, _r10, _k3456, 0); + _sum1 = MLA(_sum1, _r11, _k3456, 1); + _sum2 = MLA(_sum2, _r12, _k3456, 2); + _sum1 = MLA(_sum1, _r20, _k6789, 0); + _sum2 = MLA(_sum2, _r21, _k6789, 1); + _sum1 = MLA(_sum1, _r22, _k6789, 2); _sum1 = GiAddFloat32(_sum1, _sum2); @@ -288,6 +320,15 @@ void conv_stride1::do_conv_5x5_stride1( const float* r4 = src_ptr + IW * 4; const float* r5 = src_ptr + IW * 5; +#if defined(PREFER_VF) + const float* _k0123 = filter; + const float* _k4567 = filter + 4; + const float* _k891011 = filter + 8; + const float* _k12131415 = filter + 12; + const float* _k16171819 = filter + 16; + const float* _k20212223 = filter + 20; + const float* _k24242424 = filter + 24; +#else GI_FLOAT32_t _k0123 = GiLoadFloat32(filter); GI_FLOAT32_t _k4567 = GiLoadFloat32(filter + 4); GI_FLOAT32_t _k891011 = GiLoadFloat32(filter + 8); @@ -295,6 +336,7 @@ void conv_stride1::do_conv_5x5_stride1( GI_FLOAT32_t _k16171819 = GiLoadFloat32(filter + 16); GI_FLOAT32_t 
_k20212223 = GiLoadFloat32(filter + 20); GI_FLOAT32_t _k24242424 = GiBroadcastFloat32(filter[24]); +#endif size_t h = 0; for (; h + 1 < OH; h += 2) { @@ -340,65 +382,65 @@ void conv_stride1::do_conv_5x5_stride1( GI_FLOAT32_t _r52 = GiExtqFloat32(_r50, _r54, 2); GI_FLOAT32_t _r53 = GiExtqFloat32(_r50, _r54, 3); - _sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); - _sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); - _sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); - _sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); - _sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); - - _sum = GiSimdFmaLane(_sum, _r10, _k4567, 1); - _sum = GiSimdFmaLane(_sum, _r11, _k4567, 2); - _sum = GiSimdFmaLane(_sum, _r12, _k4567, 3); - _sum = GiSimdFmaLane(_sum, _r13, _k891011, 0); - _sum = GiSimdFmaLane(_sum, _r14, _k891011, 1); - - _sum = GiSimdFmaLane(_sum, _r20, _k891011, 2); - _sum = GiSimdFmaLane(_sum, _r21, _k891011, 3); - _sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0); - _sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1); - _sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2); - - _sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3); - _sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0); - _sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1); - _sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2); - _sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3); - - _sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0); - _sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1); - _sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2); - _sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3); - _sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0); - - _sum2 = GiSimdFmaLane(_sum2, _r10, _k0123, 0); - _sum2 = GiSimdFmaLane(_sum2, _r11, _k0123, 1); - _sum2 = GiSimdFmaLane(_sum2, _r12, _k0123, 2); - _sum2 = GiSimdFmaLane(_sum2, _r13, _k0123, 3); - _sum2 = GiSimdFmaLane(_sum2, _r14, _k4567, 0); - - _sum2 = GiSimdFmaLane(_sum2, _r20, _k4567, 1); - _sum2 = GiSimdFmaLane(_sum2, _r21, _k4567, 2); - _sum2 = GiSimdFmaLane(_sum2, _r22, _k4567, 3); - _sum2 = GiSimdFmaLane(_sum2, _r23, _k891011, 
0); - _sum2 = GiSimdFmaLane(_sum2, _r24, _k891011, 1); - - _sum2 = GiSimdFmaLane(_sum2, _r30, _k891011, 2); - _sum2 = GiSimdFmaLane(_sum2, _r31, _k891011, 3); - _sum2 = GiSimdFmaLane(_sum2, _r32, _k12131415, 0); - _sum2 = GiSimdFmaLane(_sum2, _r33, _k12131415, 1); - _sum2 = GiSimdFmaLane(_sum2, _r34, _k12131415, 2); - - _sum2 = GiSimdFmaLane(_sum2, _r40, _k12131415, 3); - _sum2 = GiSimdFmaLane(_sum2, _r41, _k16171819, 0); - _sum2 = GiSimdFmaLane(_sum2, _r42, _k16171819, 1); - _sum2 = GiSimdFmaLane(_sum2, _r43, _k16171819, 2); - _sum2 = GiSimdFmaLane(_sum2, _r44, _k16171819, 3); - - _sum2 = GiSimdFmaLane(_sum2, _r50, _k20212223, 0); - _sum2 = GiSimdFmaLane(_sum2, _r51, _k20212223, 1); - _sum2 = GiSimdFmaLane(_sum2, _r52, _k20212223, 2); - _sum2 = GiSimdFmaLane(_sum2, _r53, _k20212223, 3); - _sum2 = GiSimdFmaLane(_sum2, _r54, _k24242424, 0); + _sum = MLA(_sum, _r00, _k0123, 0); + _sum = MLA(_sum, _r01, _k0123, 1); + _sum = MLA(_sum, _r02, _k0123, 2); + _sum = MLA(_sum, _r03, _k0123, 3); + _sum = MLA(_sum, _r04, _k4567, 0); + + _sum = MLA(_sum, _r10, _k4567, 1); + _sum = MLA(_sum, _r11, _k4567, 2); + _sum = MLA(_sum, _r12, _k4567, 3); + _sum = MLA(_sum, _r13, _k891011, 0); + _sum = MLA(_sum, _r14, _k891011, 1); + + _sum = MLA(_sum, _r20, _k891011, 2); + _sum = MLA(_sum, _r21, _k891011, 3); + _sum = MLA(_sum, _r22, _k12131415, 0); + _sum = MLA(_sum, _r23, _k12131415, 1); + _sum = MLA(_sum, _r24, _k12131415, 2); + + _sum = MLA(_sum, _r30, _k12131415, 3); + _sum = MLA(_sum, _r31, _k16171819, 0); + _sum = MLA(_sum, _r32, _k16171819, 1); + _sum = MLA(_sum, _r33, _k16171819, 2); + _sum = MLA(_sum, _r34, _k16171819, 3); + + _sum = MLA(_sum, _r40, _k20212223, 0); + _sum = MLA(_sum, _r41, _k20212223, 1); + _sum = MLA(_sum, _r42, _k20212223, 2); + _sum = MLA(_sum, _r43, _k20212223, 3); + _sum = MLA(_sum, _r44, _k24242424, 0); + + _sum2 = MLA(_sum2, _r10, _k0123, 0); + _sum2 = MLA(_sum2, _r11, _k0123, 1); + _sum2 = MLA(_sum2, _r12, _k0123, 2); + _sum2 = MLA(_sum2, _r13, _k0123, 
3); + _sum2 = MLA(_sum2, _r14, _k4567, 0); + + _sum2 = MLA(_sum2, _r20, _k4567, 1); + _sum2 = MLA(_sum2, _r21, _k4567, 2); + _sum2 = MLA(_sum2, _r22, _k4567, 3); + _sum2 = MLA(_sum2, _r23, _k891011, 0); + _sum2 = MLA(_sum2, _r24, _k891011, 1); + + _sum2 = MLA(_sum2, _r30, _k891011, 2); + _sum2 = MLA(_sum2, _r31, _k891011, 3); + _sum2 = MLA(_sum2, _r32, _k12131415, 0); + _sum2 = MLA(_sum2, _r33, _k12131415, 1); + _sum2 = MLA(_sum2, _r34, _k12131415, 2); + + _sum2 = MLA(_sum2, _r40, _k12131415, 3); + _sum2 = MLA(_sum2, _r41, _k16171819, 0); + _sum2 = MLA(_sum2, _r42, _k16171819, 1); + _sum2 = MLA(_sum2, _r43, _k16171819, 2); + _sum2 = MLA(_sum2, _r44, _k16171819, 3); + + _sum2 = MLA(_sum2, _r50, _k20212223, 0); + _sum2 = MLA(_sum2, _r51, _k20212223, 1); + _sum2 = MLA(_sum2, _r52, _k20212223, 2); + _sum2 = MLA(_sum2, _r53, _k20212223, 3); + _sum2 = MLA(_sum2, _r54, _k24242424, 0); GiStoreFloat32(outptr, _sum); GiStoreFloat32(outptr2, _sum2); @@ -460,35 +502,35 @@ void conv_stride1::do_conv_5x5_stride1( GI_FLOAT32_t _r42 = GiExtqFloat32(_r40, _r44, 2); GI_FLOAT32_t _r43 = GiExtqFloat32(_r40, _r44, 3); - _sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); - _sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); - _sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); - _sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); - _sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); - - _sum = GiSimdFmaLane(_sum, _r10, _k4567, 1); - _sum = GiSimdFmaLane(_sum, _r11, _k4567, 2); - _sum = GiSimdFmaLane(_sum, _r12, _k4567, 3); - _sum = GiSimdFmaLane(_sum, _r13, _k891011, 0); - _sum = GiSimdFmaLane(_sum, _r14, _k891011, 1); - - _sum = GiSimdFmaLane(_sum, _r20, _k891011, 2); - _sum = GiSimdFmaLane(_sum, _r21, _k891011, 3); - _sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0); - _sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1); - _sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2); - - _sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3); - _sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0); - _sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1); 
- _sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2); - _sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3); - - _sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0); - _sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1); - _sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2); - _sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3); - _sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0); + _sum = MLA(_sum, _r00, _k0123, 0); + _sum = MLA(_sum, _r01, _k0123, 1); + _sum = MLA(_sum, _r02, _k0123, 2); + _sum = MLA(_sum, _r03, _k0123, 3); + _sum = MLA(_sum, _r04, _k4567, 0); + + _sum = MLA(_sum, _r10, _k4567, 1); + _sum = MLA(_sum, _r11, _k4567, 2); + _sum = MLA(_sum, _r12, _k4567, 3); + _sum = MLA(_sum, _r13, _k891011, 0); + _sum = MLA(_sum, _r14, _k891011, 1); + + _sum = MLA(_sum, _r20, _k891011, 2); + _sum = MLA(_sum, _r21, _k891011, 3); + _sum = MLA(_sum, _r22, _k12131415, 0); + _sum = MLA(_sum, _r23, _k12131415, 1); + _sum = MLA(_sum, _r24, _k12131415, 2); + + _sum = MLA(_sum, _r30, _k12131415, 3); + _sum = MLA(_sum, _r31, _k16171819, 0); + _sum = MLA(_sum, _r32, _k16171819, 1); + _sum = MLA(_sum, _r33, _k16171819, 2); + _sum = MLA(_sum, _r34, _k16171819, 3); + + _sum = MLA(_sum, _r40, _k20212223, 0); + _sum = MLA(_sum, _r41, _k20212223, 1); + _sum = MLA(_sum, _r42, _k20212223, 2); + _sum = MLA(_sum, _r43, _k20212223, 3); + _sum = MLA(_sum, _r44, _k24242424, 0); GiStoreFloat32(outptr, _sum); @@ -542,8 +584,13 @@ void conv_stride1::do_conv_7x7_stride1( rep(i, width) { GI_FLOAT32_t _sum = GiLoadFloat32(outptr); +#if defined(PREFER_VF) + const float* _k0123 = k0; + const float* _k4567 = k0 + 4; +#else GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4); +#endif GI_FLOAT32_t _r00 = GiLoadFloat32(r0); // 0 1 2 3 GI_FLOAT32_t _r04 = GiLoadFloat32(r0 + 4); // 4 5 6 7 @@ -554,16 +601,21 @@ void conv_stride1::do_conv_7x7_stride1( GI_FLOAT32_t _r05 = GiExtqFloat32(_r04, _r00n, 1); // 5 6 7 8 GI_FLOAT32_t _r06 = GiExtqFloat32(_r04, _r00n, 2); // 6 7 8 9 - _sum = 
GiSimdFmaLane(_sum, _r00, _k0123, 0); - _sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); - _sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); - _sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); - _sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); - _sum = GiSimdFmaLane(_sum, _r05, _k4567, 1); - _sum = GiSimdFmaLane(_sum, _r06, _k4567, 2); - + _sum = MLA(_sum, _r00, _k0123, 0); + _sum = MLA(_sum, _r01, _k0123, 1); + _sum = MLA(_sum, _r02, _k0123, 2); + _sum = MLA(_sum, _r03, _k0123, 3); + _sum = MLA(_sum, _r04, _k4567, 0); + _sum = MLA(_sum, _r05, _k4567, 1); + _sum = MLA(_sum, _r06, _k4567, 2); + +#if defined(PREFER_VF) + const float* _k78910 = k1; + const float* _k11121314 = k1 + 4; +#else GI_FLOAT32_t _k78910 = GiLoadFloat32(k1); GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4); +#endif GI_FLOAT32_t _r10 = GiLoadFloat32(r1); GI_FLOAT32_t _r14 = GiLoadFloat32(r1 + 4); @@ -574,16 +626,21 @@ void conv_stride1::do_conv_7x7_stride1( GI_FLOAT32_t _r15 = GiExtqFloat32(_r14, _r10n, 1); GI_FLOAT32_t _r16 = GiExtqFloat32(_r14, _r10n, 2); - _sum = GiSimdFmaLane(_sum, _r10, _k78910, 0); - _sum = GiSimdFmaLane(_sum, _r11, _k78910, 1); - _sum = GiSimdFmaLane(_sum, _r12, _k78910, 2); - _sum = GiSimdFmaLane(_sum, _r13, _k78910, 3); - _sum = GiSimdFmaLane(_sum, _r14, _k11121314, 0); - _sum = GiSimdFmaLane(_sum, _r15, _k11121314, 1); - _sum = GiSimdFmaLane(_sum, _r16, _k11121314, 2); - + _sum = MLA(_sum, _r10, _k78910, 0); + _sum = MLA(_sum, _r11, _k78910, 1); + _sum = MLA(_sum, _r12, _k78910, 2); + _sum = MLA(_sum, _r13, _k78910, 3); + _sum = MLA(_sum, _r14, _k11121314, 0); + _sum = MLA(_sum, _r15, _k11121314, 1); + _sum = MLA(_sum, _r16, _k11121314, 2); + +#if defined(PREFER_VF) + const float* _k14151617 = k2; + const float* _k18192021 = k2 + 4; +#else GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2); GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4); +#endif GI_FLOAT32_t _r20 = GiLoadFloat32(r2); GI_FLOAT32_t _r24 = GiLoadFloat32(r2 + 4); @@ -594,16 +651,21 @@ void conv_stride1::do_conv_7x7_stride1( 
GI_FLOAT32_t _r25 = GiExtqFloat32(_r24, _r20n, 1); GI_FLOAT32_t _r26 = GiExtqFloat32(_r24, _r20n, 2); - _sum = GiSimdFmaLane(_sum, _r20, _k14151617, 0); - _sum = GiSimdFmaLane(_sum, _r21, _k14151617, 1); - _sum = GiSimdFmaLane(_sum, _r22, _k14151617, 2); - _sum = GiSimdFmaLane(_sum, _r23, _k14151617, 3); - _sum = GiSimdFmaLane(_sum, _r24, _k18192021, 0); - _sum = GiSimdFmaLane(_sum, _r25, _k18192021, 1); - _sum = GiSimdFmaLane(_sum, _r26, _k18192021, 2); - + _sum = MLA(_sum, _r20, _k14151617, 0); + _sum = MLA(_sum, _r21, _k14151617, 1); + _sum = MLA(_sum, _r22, _k14151617, 2); + _sum = MLA(_sum, _r23, _k14151617, 3); + _sum = MLA(_sum, _r24, _k18192021, 0); + _sum = MLA(_sum, _r25, _k18192021, 1); + _sum = MLA(_sum, _r26, _k18192021, 2); + +#if defined(PREFER_VF) + const float* _k21222324 = k3; + const float* _k25262728 = k3 + 4; +#else GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3); GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4); +#endif GI_FLOAT32_t _r30 = GiLoadFloat32(r3); GI_FLOAT32_t _r34 = GiLoadFloat32(r3 + 4); @@ -614,16 +676,21 @@ void conv_stride1::do_conv_7x7_stride1( GI_FLOAT32_t _r35 = GiExtqFloat32(_r34, _r30n, 1); GI_FLOAT32_t _r36 = GiExtqFloat32(_r34, _r30n, 2); - _sum = GiSimdFmaLane(_sum, _r30, _k21222324, 0); - _sum = GiSimdFmaLane(_sum, _r31, _k21222324, 1); - _sum = GiSimdFmaLane(_sum, _r32, _k21222324, 2); - _sum = GiSimdFmaLane(_sum, _r33, _k21222324, 3); - _sum = GiSimdFmaLane(_sum, _r34, _k25262728, 0); - _sum = GiSimdFmaLane(_sum, _r35, _k25262728, 1); - _sum = GiSimdFmaLane(_sum, _r36, _k25262728, 2); - + _sum = MLA(_sum, _r30, _k21222324, 0); + _sum = MLA(_sum, _r31, _k21222324, 1); + _sum = MLA(_sum, _r32, _k21222324, 2); + _sum = MLA(_sum, _r33, _k21222324, 3); + _sum = MLA(_sum, _r34, _k25262728, 0); + _sum = MLA(_sum, _r35, _k25262728, 1); + _sum = MLA(_sum, _r36, _k25262728, 2); + +#if defined(PREFER_VF) + const float* _k28293031 = k4; + const float* _k32333435 = k4 + 4; +#else GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4); 
GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4); +#endif GI_FLOAT32_t _r40 = GiLoadFloat32(r4); GI_FLOAT32_t _r44 = GiLoadFloat32(r4 + 4); @@ -634,16 +701,21 @@ void conv_stride1::do_conv_7x7_stride1( GI_FLOAT32_t _r45 = GiExtqFloat32(_r44, _r40n, 1); GI_FLOAT32_t _r46 = GiExtqFloat32(_r44, _r40n, 2); - _sum = GiSimdFmaLane(_sum, _r40, _k28293031, 0); - _sum = GiSimdFmaLane(_sum, _r41, _k28293031, 1); - _sum = GiSimdFmaLane(_sum, _r42, _k28293031, 2); - _sum = GiSimdFmaLane(_sum, _r43, _k28293031, 3); - _sum = GiSimdFmaLane(_sum, _r44, _k32333435, 0); - _sum = GiSimdFmaLane(_sum, _r45, _k32333435, 1); - _sum = GiSimdFmaLane(_sum, _r46, _k32333435, 2); - + _sum = MLA(_sum, _r40, _k28293031, 0); + _sum = MLA(_sum, _r41, _k28293031, 1); + _sum = MLA(_sum, _r42, _k28293031, 2); + _sum = MLA(_sum, _r43, _k28293031, 3); + _sum = MLA(_sum, _r44, _k32333435, 0); + _sum = MLA(_sum, _r45, _k32333435, 1); + _sum = MLA(_sum, _r46, _k32333435, 2); + +#if defined(PREFER_VF) + const float* _k35363738 = k5; + const float* _k39404142 = k5 + 4; +#else GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5); GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4); +#endif GI_FLOAT32_t _r50 = GiLoadFloat32(r5); GI_FLOAT32_t _r54 = GiLoadFloat32(r5 + 4); @@ -654,17 +726,24 @@ void conv_stride1::do_conv_7x7_stride1( GI_FLOAT32_t _r55 = GiExtqFloat32(_r54, _r50n, 1); GI_FLOAT32_t _r56 = GiExtqFloat32(_r54, _r50n, 2); - _sum = GiSimdFmaLane(_sum, _r50, _k35363738, 0); - _sum = GiSimdFmaLane(_sum, _r51, _k35363738, 1); - _sum = GiSimdFmaLane(_sum, _r52, _k35363738, 2); - _sum = GiSimdFmaLane(_sum, _r53, _k35363738, 3); - _sum = GiSimdFmaLane(_sum, _r54, _k39404142, 0); - _sum = GiSimdFmaLane(_sum, _r55, _k39404142, 1); - _sum = GiSimdFmaLane(_sum, _r56, _k39404142, 2); - + _sum = MLA(_sum, _r50, _k35363738, 0); + _sum = MLA(_sum, _r51, _k35363738, 1); + _sum = MLA(_sum, _r52, _k35363738, 2); + _sum = MLA(_sum, _r53, _k35363738, 3); + _sum = MLA(_sum, _r54, _k39404142, 0); + _sum = MLA(_sum, _r55, _k39404142, 
1); + _sum = MLA(_sum, _r56, _k39404142, 2); + +#if defined(PREFER_VF) + const float* _k42434445 = k6; + float _k46474849[GI_SIMD_LEN_BYTE / sizeof(float)]; + memcpy(_k46474849, k6 + 4, + sizeof(float) * (GI_SIMD_LEN_BYTE / sizeof(float) - 1)); +#else GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6); GI_FLOAT32_t _k46474849 = GiLd1qLaneFloat32(k6 + 4 + 2, GiLoadFloat32LowHalf(k6 + 4), 2); +#endif GI_FLOAT32_t _r60 = GiLoadFloat32(r6); GI_FLOAT32_t _r64 = GiLoadFloat32(r6 + 4); @@ -675,13 +754,13 @@ void conv_stride1::do_conv_7x7_stride1( GI_FLOAT32_t _r65 = GiExtqFloat32(_r64, _r60n, 1); GI_FLOAT32_t _r66 = GiExtqFloat32(_r64, _r60n, 2); - _sum = GiSimdFmaLane(_sum, _r60, _k42434445, 0); - _sum = GiSimdFmaLane(_sum, _r61, _k42434445, 1); - _sum = GiSimdFmaLane(_sum, _r62, _k42434445, 2); - _sum = GiSimdFmaLane(_sum, _r63, _k42434445, 3); - _sum = GiSimdFmaLane(_sum, _r64, _k46474849, 0); - _sum = GiSimdFmaLane(_sum, _r65, _k46474849, 1); - _sum = GiSimdFmaLane(_sum, _r66, _k46474849, 2); + _sum = MLA(_sum, _r60, _k42434445, 0); + _sum = MLA(_sum, _r61, _k42434445, 1); + _sum = MLA(_sum, _r62, _k42434445, 2); + _sum = MLA(_sum, _r63, _k42434445, 3); + _sum = MLA(_sum, _r64, _k46474849, 0); + _sum = MLA(_sum, _r65, _k46474849, 1); + _sum = MLA(_sum, _r66, _k46474849, 2); GiStoreFloat32(outptr, _sum); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp b/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp index 5501a18c..b06a7af8 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp @@ -15,6 +15,30 @@ using namespace conv_stride2; using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; +#if defined(GI_RVV_INTRINSICS) +#define PREFER_VF +#endif + +#if defined(PREFER_VF) +#define MLA(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) +namespace { +GI_FORCEINLINE void ext_float32_ptr( + const float* a, const
float* b, const int n, float* ret) { + int t_count = GI_SIMD_LEN_BYTE / sizeof(float); + int a_count = t_count - n; + for (int i = 0; i < a_count; i++) { + ret[i] = a[i + n]; + } + for (int i = 0; i < n; i++) { + ret[i + a_count] = b[i]; + } +} +}; // namespace + +#else +#define MLA(a, b, c, d) GiSimdFmaLane(a, b, c, d) +#endif + void conv_stride2::do_conv_2x2_stride2( const float* src, const float* filter, float* dst, size_t IH, size_t IW, size_t OH, size_t OW, size_t IC) { @@ -29,7 +53,11 @@ void conv_stride2::do_conv_2x2_stride2( const float* k0 = filter; +#if defined(PREFER_VF) + const float* _k0123 = k0; +#else GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); +#endif rep(h, OH) { int nn = OW >> 2; @@ -41,16 +69,16 @@ void conv_stride2::do_conv_2x2_stride2( GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6 GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7 - _outp = GiSimdFmaLane(_outp, _r00, _k0123, 0); - _outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); + _outp = MLA(_outp, _r00, _k0123, 0); + _outp = MLA(_outp, _r01, _k0123, 1); GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0); GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1); - _outp = GiSimdFmaLane(_outp, _r10, _k0123, 2); - _outp = GiSimdFmaLane(_outp, _r11, _k0123, 3); + _outp = MLA(_outp, _r10, _k0123, 2); + _outp = MLA(_outp, _r11, _k0123, 3); GiStoreFloat32(outptr, _outp); @@ -84,10 +112,18 @@ void conv_stride2::do_conv_3x3_stride2( const float* k1 = filter + 3; const float* k2 = filter + 5; +#if defined(PREFER_VF) + const float* _k0123 = k0; + const float* _k3456 = k1; + const float* _k5678 = k2; + float _k6789[GI_SIMD_LEN_BYTE / sizeof(float)]; + ext_float32_ptr(_k5678, _k5678, 1, _k6789); +#else GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); GI_FLOAT32_t _k3456 = GiLoadFloat32(k1); GI_FLOAT32_t _k5678 = GiLoadFloat32(k2); GI_FLOAT32_t _k6789 = GiExtqFloat32(_k5678, _k5678, 1); +#endif rep(h, OH) { int nn = OW >> 2; @@ -102,9 +138,9 @@ 
void conv_stride2::do_conv_3x3_stride2( GI_FLOAT32_t _r02 = GiExtqFloat32( _r00, GiGetSubVectorFloat32V2(_r0n, 0), 1); // 2 4 6 8 - _outp = GiSimdFmaLane(_outp, _r00, _k0123, 0); - _outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); - _outp = GiSimdFmaLane(_outp, _r02, _k0123, 2); + _outp = MLA(_outp, _r00, _k0123, 0); + _outp = MLA(_outp, _r01, _k0123, 1); + _outp = MLA(_outp, _r02, _k0123, 2); GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); GI_FLOAT32_V2_t _r1n = GiLoadUzipFloat32V2(r1 + 8); @@ -114,9 +150,9 @@ void conv_stride2::do_conv_3x3_stride2( GI_FLOAT32_t _r12 = GiExtqFloat32(_r10, GiGetSubVectorFloat32V2(_r1n, 0), 1); - _outp = GiSimdFmaLane(_outp, _r10, _k3456, 0); - _outp = GiSimdFmaLane(_outp, _r11, _k3456, 1); - _outp = GiSimdFmaLane(_outp, _r12, _k3456, 2); + _outp = MLA(_outp, _r10, _k3456, 0); + _outp = MLA(_outp, _r11, _k3456, 1); + _outp = MLA(_outp, _r12, _k3456, 2); GI_FLOAT32_V2_t _r2 = GiLoadUzipFloat32V2(r2); GI_FLOAT32_V2_t _r2n = GiLoadUzipFloat32V2(r2 + 8); @@ -126,9 +162,9 @@ void conv_stride2::do_conv_3x3_stride2( GI_FLOAT32_t _r22 = GiExtqFloat32(_r20, GiGetSubVectorFloat32V2(_r2n, 0), 1); - _outp = GiSimdFmaLane(_outp, _r20, _k6789, 0); - _outp = GiSimdFmaLane(_outp, _r21, _k6789, 1); - _outp = GiSimdFmaLane(_outp, _r22, _k6789, 2); + _outp = MLA(_outp, _r20, _k6789, 0); + _outp = MLA(_outp, _r21, _k6789, 1); + _outp = MLA(_outp, _r22, _k6789, 2); GiStoreFloat32(outptr, _outp); @@ -162,6 +198,15 @@ void conv_stride2::do_conv_5x5_stride2( const float* r3 = src_ptr + IW * 3; const float* r4 = src_ptr + IW * 4; +#if defined(PREFER_VF) + const float* _k0123 = filter; + const float* _k4567 = filter + 4; + const float* _k891011 = filter + 8; + const float* _k12131415 = filter + 12; + const float* _k16171819 = filter + 16; + const float* _k20212223 = filter + 20; + const float* _k24242424 = filter + 24; +#else GI_FLOAT32_t _k0123 = GiLoadFloat32(filter); GI_FLOAT32_t _k4567 = GiLoadFloat32(filter + 4); GI_FLOAT32_t _k891011 = GiLoadFloat32(filter 
+ 8); @@ -169,6 +214,7 @@ void conv_stride2::do_conv_5x5_stride2( GI_FLOAT32_t _k16171819 = GiLoadFloat32(filter + 16); GI_FLOAT32_t _k20212223 = GiLoadFloat32(filter + 20); GI_FLOAT32_t _k24242424 = GiBroadcastFloat32(filter[24]); +#endif for (size_t i = 0; i < OH; i++) { int nn = OW >> 2; @@ -230,35 +276,35 @@ void conv_stride2::do_conv_5x5_stride2( GI_FLOAT32_t _r43 = GiExtqFloat32(_r41, _r4_9111315, 1); GI_FLOAT32_t _r44 = GiExtqFloat32(_r40, _r4_8101214, 2); - _sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); - _sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); - _sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); - _sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); - _sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); - - _sum = GiSimdFmaLane(_sum, _r10, _k4567, 1); - _sum = GiSimdFmaLane(_sum, _r11, _k4567, 2); - _sum = GiSimdFmaLane(_sum, _r12, _k4567, 3); - _sum = GiSimdFmaLane(_sum, _r13, _k891011, 0); - _sum = GiSimdFmaLane(_sum, _r14, _k891011, 1); - - _sum = GiSimdFmaLane(_sum, _r20, _k891011, 2); - _sum = GiSimdFmaLane(_sum, _r21, _k891011, 3); - _sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0); - _sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1); - _sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2); - - _sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3); - _sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0); - _sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1); - _sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2); - _sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3); - - _sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0); - _sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1); - _sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2); - _sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3); - _sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0); + _sum = MLA(_sum, _r00, _k0123, 0); + _sum = MLA(_sum, _r01, _k0123, 1); + _sum = MLA(_sum, _r02, _k0123, 2); + _sum = MLA(_sum, _r03, _k0123, 3); + _sum = MLA(_sum, _r04, _k4567, 0); + + _sum = MLA(_sum, _r10, _k4567, 1); + _sum = MLA(_sum, _r11, _k4567, 2); + _sum = MLA(_sum, _r12, 
_k4567, 3); + _sum = MLA(_sum, _r13, _k891011, 0); + _sum = MLA(_sum, _r14, _k891011, 1); + + _sum = MLA(_sum, _r20, _k891011, 2); + _sum = MLA(_sum, _r21, _k891011, 3); + _sum = MLA(_sum, _r22, _k12131415, 0); + _sum = MLA(_sum, _r23, _k12131415, 1); + _sum = MLA(_sum, _r24, _k12131415, 2); + + _sum = MLA(_sum, _r30, _k12131415, 3); + _sum = MLA(_sum, _r31, _k16171819, 0); + _sum = MLA(_sum, _r32, _k16171819, 1); + _sum = MLA(_sum, _r33, _k16171819, 2); + _sum = MLA(_sum, _r34, _k16171819, 3); + + _sum = MLA(_sum, _r40, _k20212223, 0); + _sum = MLA(_sum, _r41, _k20212223, 1); + _sum = MLA(_sum, _r42, _k20212223, 2); + _sum = MLA(_sum, _r43, _k20212223, 3); + _sum = MLA(_sum, _r44, _k24242424, 0); GiStoreFloat32(outptr, _sum); @@ -312,8 +358,13 @@ void conv_stride2::do_conv_7x7_stride2( rep(i, nn) { GI_FLOAT32_t _sum = GiLoadFloat32(outptr); +#if defined(PREFER_VF) + const float* _k0123 = k0; + const float* _k4567 = k0 + 4; +#else GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4); +#endif GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0); GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8); @@ -331,16 +382,21 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_t _r05 = GiExtqFloat32(_r01, _r0_9111315, 2); // 5 7 9 11 GI_FLOAT32_t _r06 = GiExtqFloat32(_r00, _r0_8101214, 3); // 6 8 10 12 - _sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); - _sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); - _sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); - _sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); - _sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); - _sum = GiSimdFmaLane(_sum, _r05, _k4567, 1); - _sum = GiSimdFmaLane(_sum, _r06, _k4567, 2); - + _sum = MLA(_sum, _r00, _k0123, 0); + _sum = MLA(_sum, _r01, _k0123, 1); + _sum = MLA(_sum, _r02, _k0123, 2); + _sum = MLA(_sum, _r03, _k0123, 3); + _sum = MLA(_sum, _r04, _k4567, 0); + _sum = MLA(_sum, _r05, _k4567, 1); + _sum = MLA(_sum, _r06, _k4567, 2); + +#if defined(PREFER_VF) + const float* _k78910 = k1; + 
const float* _k11121314 = k1 + 4; +#else GI_FLOAT32_t _k78910 = GiLoadFloat32(k1); GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4); +#endif GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1); GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8); @@ -354,16 +410,21 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_t _r15 = GiExtqFloat32(_r11, _r1_9111315, 2); GI_FLOAT32_t _r16 = GiExtqFloat32(_r10, _r1_8101214, 3); - _sum = GiSimdFmaLane(_sum, _r10, _k78910, 0); - _sum = GiSimdFmaLane(_sum, _r11, _k78910, 1); - _sum = GiSimdFmaLane(_sum, _r12, _k78910, 2); - _sum = GiSimdFmaLane(_sum, _r13, _k78910, 3); - _sum = GiSimdFmaLane(_sum, _r14, _k11121314, 0); - _sum = GiSimdFmaLane(_sum, _r15, _k11121314, 1); - _sum = GiSimdFmaLane(_sum, _r16, _k11121314, 2); - + _sum = MLA(_sum, _r10, _k78910, 0); + _sum = MLA(_sum, _r11, _k78910, 1); + _sum = MLA(_sum, _r12, _k78910, 2); + _sum = MLA(_sum, _r13, _k78910, 3); + _sum = MLA(_sum, _r14, _k11121314, 0); + _sum = MLA(_sum, _r15, _k11121314, 1); + _sum = MLA(_sum, _r16, _k11121314, 2); + +#if defined(PREFER_VF) + const float* _k14151617 = k2; + const float* _k18192021 = k2 + 4; +#else GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2); GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4); +#endif GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2); GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8); @@ -377,16 +438,21 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_t _r25 = GiExtqFloat32(_r21, _r2_9111315, 2); GI_FLOAT32_t _r26 = GiExtqFloat32(_r20, _r2_8101214, 3); - _sum = GiSimdFmaLane(_sum, _r20, _k14151617, 0); - _sum = GiSimdFmaLane(_sum, _r21, _k14151617, 1); - _sum = GiSimdFmaLane(_sum, _r22, _k14151617, 2); - _sum = GiSimdFmaLane(_sum, _r23, _k14151617, 3); - _sum = GiSimdFmaLane(_sum, _r24, _k18192021, 0); - _sum = GiSimdFmaLane(_sum, _r25, _k18192021, 1); - _sum = GiSimdFmaLane(_sum, _r26, _k18192021, 2); - + _sum = MLA(_sum, _r20, _k14151617, 0); + _sum = MLA(_sum, _r21, _k14151617, 1); + _sum = 
MLA(_sum, _r22, _k14151617, 2); + _sum = MLA(_sum, _r23, _k14151617, 3); + _sum = MLA(_sum, _r24, _k18192021, 0); + _sum = MLA(_sum, _r25, _k18192021, 1); + _sum = MLA(_sum, _r26, _k18192021, 2); + +#if defined(PREFER_VF) + const float* _k21222324 = k3; + const float* _k25262728 = k3 + 4; +#else GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3); GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4); +#endif GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3); GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8); @@ -400,16 +466,21 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_t _r35 = GiExtqFloat32(_r31, _r3_9111315, 2); GI_FLOAT32_t _r36 = GiExtqFloat32(_r30, _r3_8101214, 3); - _sum = GiSimdFmaLane(_sum, _r30, _k21222324, 0); - _sum = GiSimdFmaLane(_sum, _r31, _k21222324, 1); - _sum = GiSimdFmaLane(_sum, _r32, _k21222324, 2); - _sum = GiSimdFmaLane(_sum, _r33, _k21222324, 3); - _sum = GiSimdFmaLane(_sum, _r34, _k25262728, 0); - _sum = GiSimdFmaLane(_sum, _r35, _k25262728, 1); - _sum = GiSimdFmaLane(_sum, _r36, _k25262728, 2); - + _sum = MLA(_sum, _r30, _k21222324, 0); + _sum = MLA(_sum, _r31, _k21222324, 1); + _sum = MLA(_sum, _r32, _k21222324, 2); + _sum = MLA(_sum, _r33, _k21222324, 3); + _sum = MLA(_sum, _r34, _k25262728, 0); + _sum = MLA(_sum, _r35, _k25262728, 1); + _sum = MLA(_sum, _r36, _k25262728, 2); + +#if defined(PREFER_VF) + const float* _k28293031 = k4; + const float* _k32333435 = k4 + 4; +#else GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4); GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4); +#endif GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4); GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8); @@ -423,16 +494,21 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_t _r45 = GiExtqFloat32(_r41, _r4_9111315, 2); GI_FLOAT32_t _r46 = GiExtqFloat32(_r40, _r4_8101214, 3); - _sum = GiSimdFmaLane(_sum, _r40, _k28293031, 0); - _sum = GiSimdFmaLane(_sum, _r41, _k28293031, 1); - _sum = GiSimdFmaLane(_sum, _r42, _k28293031, 2); - _sum = 
GiSimdFmaLane(_sum, _r43, _k28293031, 3); - _sum = GiSimdFmaLane(_sum, _r44, _k32333435, 0); - _sum = GiSimdFmaLane(_sum, _r45, _k32333435, 1); - _sum = GiSimdFmaLane(_sum, _r46, _k32333435, 2); - + _sum = MLA(_sum, _r40, _k28293031, 0); + _sum = MLA(_sum, _r41, _k28293031, 1); + _sum = MLA(_sum, _r42, _k28293031, 2); + _sum = MLA(_sum, _r43, _k28293031, 3); + _sum = MLA(_sum, _r44, _k32333435, 0); + _sum = MLA(_sum, _r45, _k32333435, 1); + _sum = MLA(_sum, _r46, _k32333435, 2); + +#if defined(PREFER_VF) + const float* _k35363738 = k5; + const float* _k39404142 = k5 + 4; +#else GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5); GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4); +#endif GI_FLOAT32_V2_t _r50_02461357 = GiLoadUzipFloat32V2(r5); GI_FLOAT32_V2_t _r50nx2 = GiLoadUzipFloat32V2(r5 + 8); @@ -446,16 +522,21 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_t _r55 = GiExtqFloat32(_r51, _r5_9111315, 2); GI_FLOAT32_t _r56 = GiExtqFloat32(_r50, _r5_8101214, 3); - _sum = GiSimdFmaLane(_sum, _r50, _k35363738, 0); - _sum = GiSimdFmaLane(_sum, _r51, _k35363738, 1); - _sum = GiSimdFmaLane(_sum, _r52, _k35363738, 2); - _sum = GiSimdFmaLane(_sum, _r53, _k35363738, 3); - _sum = GiSimdFmaLane(_sum, _r54, _k39404142, 0); - _sum = GiSimdFmaLane(_sum, _r55, _k39404142, 1); - _sum = GiSimdFmaLane(_sum, _r56, _k39404142, 2); - + _sum = MLA(_sum, _r50, _k35363738, 0); + _sum = MLA(_sum, _r51, _k35363738, 1); + _sum = MLA(_sum, _r52, _k35363738, 2); + _sum = MLA(_sum, _r53, _k35363738, 3); + _sum = MLA(_sum, _r54, _k39404142, 0); + _sum = MLA(_sum, _r55, _k39404142, 1); + _sum = MLA(_sum, _r56, _k39404142, 2); + +#if defined(PREFER_VF) + const float* _k42434445 = k6; + const float* _k45464748 = k6 + 3; +#else GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6); GI_FLOAT32_t _k45464748 = GiLoadFloat32(k6 + 3); +#endif GI_FLOAT32_V2_t _r60_02461357 = GiLoadUzipFloat32V2(r6); GI_FLOAT32_V2_t _r60nx2 = GiLoadUzipFloat32V2(r6 + 8); @@ -469,13 +550,13 @@ void conv_stride2::do_conv_7x7_stride2( 
GI_FLOAT32_t _r65 = GiExtqFloat32(_r61, _r6_9111315, 2); GI_FLOAT32_t _r66 = GiExtqFloat32(_r60, _r6_8101214, 3); - _sum = GiSimdFmaLane(_sum, _r60, _k42434445, 0); - _sum = GiSimdFmaLane(_sum, _r61, _k42434445, 1); - _sum = GiSimdFmaLane(_sum, _r62, _k42434445, 2); - _sum = GiSimdFmaLane(_sum, _r63, _k42434445, 3); - _sum = GiSimdFmaLane(_sum, _r64, _k45464748, 1); - _sum = GiSimdFmaLane(_sum, _r65, _k45464748, 2); - _sum = GiSimdFmaLane(_sum, _r66, _k45464748, 3); + _sum = MLA(_sum, _r60, _k42434445, 0); + _sum = MLA(_sum, _r61, _k42434445, 1); + _sum = MLA(_sum, _r62, _k42434445, 2); + _sum = MLA(_sum, _r63, _k42434445, 3); + _sum = MLA(_sum, _r64, _k45464748, 1); + _sum = MLA(_sum, _r65, _k45464748, 2); + _sum = MLA(_sum, _r66, _k45464748, 3); GiStoreFloat32(outptr, _sum); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp b/dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp index 41e29cb2..087a738c 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp @@ -75,6 +75,21 @@ struct InputTransformF73_NCHW44 { size_t icb = ic / pack_size; GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8; +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) +//! x86 and rvv GiSimdFmaLane API is slow; as an alternative, use +//! 
GiMultiplyAddScalarFloat32 +#define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) + const float* v0 = input_parameters + 0; + const float* v1 = input_parameters + 4; + const float* v2 = input_parameters + 8; + const float* v3 = input_parameters + 12; + const float* v4 = input_parameters + 16; + const float* v5 = input_parameters + 20; + const float* v6 = input_parameters + 24; +#define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d)) +#else +#define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d) +#define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d) GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters + 0); GI_FLOAT32_t v1 = GiLoadFloat32(input_parameters + 4); GI_FLOAT32_t v2 = GiLoadFloat32(input_parameters + 8); @@ -82,6 +97,7 @@ struct InputTransformF73_NCHW44 { GI_FLOAT32_t v4 = GiLoadFloat32(input_parameters + 16); GI_FLOAT32_t v5 = GiLoadFloat32(input_parameters + 20); GI_FLOAT32_t v6 = GiLoadFloat32(input_parameters + 24); +#endif //! B //! 1.5 0 0 0 0 0 0 0 0 @@ -120,59 +136,59 @@ struct InputTransformF73_NCHW44 { auto t##i##5 = d7; \ auto t##i##6 = d7; \ auto t##i##7 = d7; \ - t##i##8 = GiFmsqLaneQFloat32(t##i##8, d7, v0, 0); \ + t##i##8 = MSUB(t##i##8, d7, v0, 0); \ t##i##0 = GiSubtractFloat32(t##i##0, d1); \ - t##i##1 = GiFmsqLaneQFloat32(t##i##1, d1, v0, 0); \ - t##i##2 = GiSimdFmaLane(t##i##2, d1, v0, 0); \ - t##i##3 = GiFmsqLaneQFloat32(t##i##3, d1, v0, 1); \ - t##i##4 = GiSimdFmaLane(t##i##4, d1, v0, 1); \ - t##i##5 = GiFmsqLaneQFloat32(t##i##5, d1, v0, 2); \ - t##i##6 = GiSimdFmaLane(t##i##6, d1, v0, 2); \ + t##i##1 = MSUB(t##i##1, d1, v0, 0); \ + t##i##2 = MADD(t##i##2, d1, v0, 0); \ + t##i##3 = MSUB(t##i##3, d1, v0, 1); \ + t##i##4 = MADD(t##i##4, d1, v0, 1); \ + t##i##5 = MSUB(t##i##5, d1, v0, 2); \ + t##i##6 = MADD(t##i##6, d1, v0, 2); \ t##i##7 = GiSubtractFloat32(t##i##7, d1); \ - t##i##8 = GiSimdFmaLane(t##i##8, d1, v0, 0); \ - t##i##0 = GiFmsqLaneQFloat32(t##i##0, d2, v0, 3); \ - t##i##1 = GiFmsqLaneQFloat32(t##i##1, d2, 
v1, 0); \ - t##i##2 = GiFmsqLaneQFloat32(t##i##2, d2, v1, 1); \ - t##i##3 = GiSimdFmaLane(t##i##3, d2, v1, 2); \ - t##i##4 = GiFmsqLaneQFloat32(t##i##4, d2, v1, 3); \ - t##i##5 = GiFmsqLaneQFloat32(t##i##5, d2, v2, 0); \ - t##i##6 = GiFmsqLaneQFloat32(t##i##6, d2, v2, 1); \ + t##i##8 = MADD(t##i##8, d1, v0, 0); \ + t##i##0 = MSUB(t##i##0, d2, v0, 3); \ + t##i##1 = MSUB(t##i##1, d2, v1, 0); \ + t##i##2 = MSUB(t##i##2, d2, v1, 1); \ + t##i##3 = MADD(t##i##3, d2, v1, 2); \ + t##i##4 = MSUB(t##i##4, d2, v1, 3); \ + t##i##5 = MSUB(t##i##5, d2, v2, 0); \ + t##i##6 = MSUB(t##i##6, d2, v2, 1); \ t##i##8 = GiSubtractFloat32(t##i##8, d2); \ - t##i##0 = GiSimdFmaLane(t##i##0, d3, v2, 2); \ - t##i##1 = GiSimdFmaLane(t##i##1, d3, v2, 3); \ - t##i##2 = GiFmsqLaneQFloat32(t##i##2, d3, v3, 0); \ - t##i##3 = GiSimdFmaLane(t##i##3, d3, v2, 0); \ - t##i##4 = GiFmsqLaneQFloat32(t##i##4, d3, v3, 1); \ - t##i##5 = GiSimdFmaLane(t##i##5, d3, v3, 2); \ - t##i##6 = GiSimdFmaLane(t##i##6, d3, v3, 3); \ - t##i##7 = GiSimdFmaLane(t##i##7, d3, v2, 2); \ - t##i##8 = GiFmsqLaneQFloat32(t##i##8, d3, v0, 3); \ - t##i##0 = GiSimdFmaLane(t##i##0, d4, v0, 3); \ - t##i##1 = GiSimdFmaLane(t##i##1, d4, v4, 0); \ - t##i##2 = GiSimdFmaLane(t##i##2, d4, v4, 1); \ - t##i##3 = GiFmsqLaneQFloat32(t##i##3, d4, v4, 2); \ - t##i##4 = GiSimdFmaLane(t##i##4, d4, v4, 3); \ - t##i##5 = GiSimdFmaLane(t##i##5, d4, v5, 0); \ - t##i##6 = GiSimdFmaLane(t##i##6, d4, v5, 1); \ - t##i##8 = GiSimdFmaLane(t##i##8, d4, v2, 2); \ - t##i##0 = GiFmsqLaneQFloat32(t##i##0, d5, v2, 2); \ - t##i##1 = GiFmsqLaneQFloat32(t##i##1, d5, v5, 2); \ - t##i##2 = GiFmsqLaneQFloat32(t##i##2, d5, v5, 3); \ - t##i##3 = GiFmsqLaneQFloat32(t##i##3, d5, v6, 0); \ - t##i##4 = GiSimdFmaLane(t##i##4, d5, v6, 1); \ - t##i##5 = GiFmsqLaneQFloat32(t##i##5, d5, v5, 2); \ - t##i##6 = GiFmsqLaneQFloat32(t##i##6, d5, v6, 0); \ - t##i##7 = GiFmsqLaneQFloat32(t##i##7, d5, v2, 2); \ - t##i##8 = GiSimdFmaLane(t##i##8, d5, v0, 3); \ - t##i##0 = 
GiFmsqLaneQFloat32(t##i##0, d6, v0, 0); \ - t##i##1 = GiFmsqLaneQFloat32(t##i##1, d6, v1, 0); \ - t##i##2 = GiFmsqLaneQFloat32(t##i##2, d6, v1, 1); \ - t##i##3 = GiSimdFmaLane(t##i##3, d6, v1, 0); \ - t##i##4 = GiFmsqLaneQFloat32(t##i##4, d6, v3, 1); \ + t##i##0 = MADD(t##i##0, d3, v2, 2); \ + t##i##1 = MADD(t##i##1, d3, v2, 3); \ + t##i##2 = MSUB(t##i##2, d3, v3, 0); \ + t##i##3 = MADD(t##i##3, d3, v2, 0); \ + t##i##4 = MSUB(t##i##4, d3, v3, 1); \ + t##i##5 = MADD(t##i##5, d3, v3, 2); \ + t##i##6 = MADD(t##i##6, d3, v3, 3); \ + t##i##7 = MADD(t##i##7, d3, v2, 2); \ + t##i##8 = MSUB(t##i##8, d3, v0, 3); \ + t##i##0 = MADD(t##i##0, d4, v0, 3); \ + t##i##1 = MADD(t##i##1, d4, v4, 0); \ + t##i##2 = MADD(t##i##2, d4, v4, 1); \ + t##i##3 = MSUB(t##i##3, d4, v4, 2); \ + t##i##4 = MADD(t##i##4, d4, v4, 3); \ + t##i##5 = MADD(t##i##5, d4, v5, 0); \ + t##i##6 = MADD(t##i##6, d4, v5, 1); \ + t##i##8 = MADD(t##i##8, d4, v2, 2); \ + t##i##0 = MSUB(t##i##0, d5, v2, 2); \ + t##i##1 = MSUB(t##i##1, d5, v5, 2); \ + t##i##2 = MSUB(t##i##2, d5, v5, 3); \ + t##i##3 = MSUB(t##i##3, d5, v6, 0); \ + t##i##4 = MADD(t##i##4, d5, v6, 1); \ + t##i##5 = MSUB(t##i##5, d5, v5, 2); \ + t##i##6 = MSUB(t##i##6, d5, v6, 0); \ + t##i##7 = MSUB(t##i##7, d5, v2, 2); \ + t##i##8 = MADD(t##i##8, d5, v0, 3); \ + t##i##0 = MSUB(t##i##0, d6, v0, 0); \ + t##i##1 = MSUB(t##i##1, d6, v1, 0); \ + t##i##2 = MSUB(t##i##2, d6, v1, 1); \ + t##i##3 = MADD(t##i##3, d6, v1, 0); \ + t##i##4 = MSUB(t##i##4, d6, v3, 1); \ t##i##5 = GiSubtractFloat32(t##i##5, d6); \ - t##i##6 = GiFmsqLaneQFloat32(t##i##6, d6, v6, 2); \ - t##i##8 = GiFmsqLaneQFloat32(t##i##8, d6, v2, 2); \ - t##i##0 = GiSimdFmaLane(t##i##0, d0, v0, 0); + t##i##6 = MSUB(t##i##6, d6, v6, 2); \ + t##i##8 = MSUB(t##i##8, d6, v2, 2); \ + t##i##0 = MADD(t##i##0, d0, v0, 0); UNROLL_CALL_RAW(9, cb); #undef cb @@ -187,59 +203,59 @@ struct InputTransformF73_NCHW44 { d5 = t7##i; \ d6 = t7##i; \ d7 = t7##i; \ - d8 = GiFmsqLaneQFloat32(d8, t7##i, v0, 0); \ + d8 = 
MSUB(d8, t7##i, v0, 0); \ d0 = GiSubtractFloat32(d0, t1##i); \ - d1 = GiFmsqLaneQFloat32(d1, t1##i, v0, 0); \ - d2 = GiSimdFmaLane(d2, t1##i, v0, 0); \ - d3 = GiFmsqLaneQFloat32(d3, t1##i, v0, 1); \ - d4 = GiSimdFmaLane(d4, t1##i, v0, 1); \ - d5 = GiFmsqLaneQFloat32(d5, t1##i, v0, 2); \ - d6 = GiSimdFmaLane(d6, t1##i, v0, 2); \ + d1 = MSUB(d1, t1##i, v0, 0); \ + d2 = MADD(d2, t1##i, v0, 0); \ + d3 = MSUB(d3, t1##i, v0, 1); \ + d4 = MADD(d4, t1##i, v0, 1); \ + d5 = MSUB(d5, t1##i, v0, 2); \ + d6 = MADD(d6, t1##i, v0, 2); \ d7 = GiSubtractFloat32(d7, t1##i); \ - d8 = GiSimdFmaLane(d8, t1##i, v0, 0); \ - d0 = GiFmsqLaneQFloat32(d0, t2##i, v0, 3); \ - d1 = GiFmsqLaneQFloat32(d1, t2##i, v1, 0); \ - d2 = GiFmsqLaneQFloat32(d2, t2##i, v1, 1); \ - d3 = GiSimdFmaLane(d3, t2##i, v1, 2); \ - d4 = GiFmsqLaneQFloat32(d4, t2##i, v1, 3); \ - d5 = GiFmsqLaneQFloat32(d5, t2##i, v2, 0); \ - d6 = GiFmsqLaneQFloat32(d6, t2##i, v2, 1); \ + d8 = MADD(d8, t1##i, v0, 0); \ + d0 = MSUB(d0, t2##i, v0, 3); \ + d1 = MSUB(d1, t2##i, v1, 0); \ + d2 = MSUB(d2, t2##i, v1, 1); \ + d3 = MADD(d3, t2##i, v1, 2); \ + d4 = MSUB(d4, t2##i, v1, 3); \ + d5 = MSUB(d5, t2##i, v2, 0); \ + d6 = MSUB(d6, t2##i, v2, 1); \ d8 = GiSubtractFloat32(d8, t2##i); \ - d0 = GiSimdFmaLane(d0, t3##i, v2, 2); \ - d1 = GiSimdFmaLane(d1, t3##i, v2, 3); \ - d2 = GiFmsqLaneQFloat32(d2, t3##i, v3, 0); \ - d3 = GiSimdFmaLane(d3, t3##i, v2, 0); \ - d4 = GiFmsqLaneQFloat32(d4, t3##i, v3, 1); \ - d5 = GiSimdFmaLane(d5, t3##i, v3, 2); \ - d6 = GiSimdFmaLane(d6, t3##i, v3, 3); \ - d7 = GiSimdFmaLane(d7, t3##i, v2, 2); \ - d8 = GiFmsqLaneQFloat32(d8, t3##i, v0, 3); \ - d0 = GiSimdFmaLane(d0, t4##i, v0, 3); \ - d1 = GiSimdFmaLane(d1, t4##i, v4, 0); \ - d2 = GiSimdFmaLane(d2, t4##i, v4, 1); \ - d3 = GiFmsqLaneQFloat32(d3, t4##i, v4, 2); \ - d4 = GiSimdFmaLane(d4, t4##i, v4, 3); \ - d5 = GiSimdFmaLane(d5, t4##i, v5, 0); \ - d6 = GiSimdFmaLane(d6, t4##i, v5, 1); \ - d8 = GiSimdFmaLane(d8, t4##i, v2, 2); \ - d0 = GiFmsqLaneQFloat32(d0, 
t5##i, v2, 2); \ - d1 = GiFmsqLaneQFloat32(d1, t5##i, v5, 2); \ - d2 = GiFmsqLaneQFloat32(d2, t5##i, v5, 3); \ - d3 = GiFmsqLaneQFloat32(d3, t5##i, v6, 0); \ - d4 = GiSimdFmaLane(d4, t5##i, v6, 1); \ - d5 = GiFmsqLaneQFloat32(d5, t5##i, v5, 2); \ - d6 = GiFmsqLaneQFloat32(d6, t5##i, v6, 0); \ - d7 = GiFmsqLaneQFloat32(d7, t5##i, v2, 2); \ - d8 = GiSimdFmaLane(d8, t5##i, v0, 3); \ - d0 = GiFmsqLaneQFloat32(d0, t6##i, v0, 0); \ - d1 = GiFmsqLaneQFloat32(d1, t6##i, v1, 0); \ - d2 = GiFmsqLaneQFloat32(d2, t6##i, v1, 1); \ - d3 = GiSimdFmaLane(d3, t6##i, v1, 0); \ - d4 = GiFmsqLaneQFloat32(d4, t6##i, v3, 1); \ + d0 = MADD(d0, t3##i, v2, 2); \ + d1 = MADD(d1, t3##i, v2, 3); \ + d2 = MSUB(d2, t3##i, v3, 0); \ + d3 = MADD(d3, t3##i, v2, 0); \ + d4 = MSUB(d4, t3##i, v3, 1); \ + d5 = MADD(d5, t3##i, v3, 2); \ + d6 = MADD(d6, t3##i, v3, 3); \ + d7 = MADD(d7, t3##i, v2, 2); \ + d8 = MSUB(d8, t3##i, v0, 3); \ + d0 = MADD(d0, t4##i, v0, 3); \ + d1 = MADD(d1, t4##i, v4, 0); \ + d2 = MADD(d2, t4##i, v4, 1); \ + d3 = MSUB(d3, t4##i, v4, 2); \ + d4 = MADD(d4, t4##i, v4, 3); \ + d5 = MADD(d5, t4##i, v5, 0); \ + d6 = MADD(d6, t4##i, v5, 1); \ + d8 = MADD(d8, t4##i, v2, 2); \ + d0 = MSUB(d0, t5##i, v2, 2); \ + d1 = MSUB(d1, t5##i, v5, 2); \ + d2 = MSUB(d2, t5##i, v5, 3); \ + d3 = MSUB(d3, t5##i, v6, 0); \ + d4 = MADD(d4, t5##i, v6, 1); \ + d5 = MSUB(d5, t5##i, v5, 2); \ + d6 = MSUB(d6, t5##i, v6, 0); \ + d7 = MSUB(d7, t5##i, v2, 2); \ + d8 = MADD(d8, t5##i, v0, 3); \ + d0 = MSUB(d0, t6##i, v0, 0); \ + d1 = MSUB(d1, t6##i, v1, 0); \ + d2 = MSUB(d2, t6##i, v1, 1); \ + d3 = MADD(d3, t6##i, v1, 0); \ + d4 = MSUB(d4, t6##i, v3, 1); \ d5 = GiSubtractFloat32(d5, t6##i); \ - d6 = GiFmsqLaneQFloat32(d6, t6##i, v6, 2); \ - d8 = GiFmsqLaneQFloat32(d8, t6##i, v2, 2); \ - d0 = GiSimdFmaLane(d0, t0##i, v0, 0); \ + d6 = MSUB(d6, t6##i, v6, 2); \ + d8 = MSUB(d8, t6##i, v2, 2); \ + d0 = MADD(d0, t0##i, v0, 0); \ GiStoreFloat32( \ input_transform_buf + \ (0 * alpha + i) * ICB * nr_units_in_tile * 
pack_size + \ @@ -288,6 +304,8 @@ struct InputTransformF73_NCHW44 { UNROLL_CALL_RAW(9, cb); #undef cb +#undef MADD +#undef MSUB } }; diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h index 8da92f43..8f9edde3 100644 --- a/dnn/src/fallback/general_intrinsic/gi_float.h +++ b/dnn/src/fallback/general_intrinsic/gi_float.h @@ -224,9 +224,7 @@ GI_FLOAT32_t GiMlaqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t c) { #endif #elif defined(GI_SSE2_INTRINSICS) // fma is coming soon, but right now: - __m128 res; - res = _mm_mul_ps(c, b); - return _mm_add_ps(a, res); + return _mm_add_ps(a, _mm_mul_ps(c, b)); #elif defined(GI_RVV_INTRINSICS) return vfmadd_vv_f32m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(float)); #else