diff --git a/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp b/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp index fc8d0084..b64b2557 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp @@ -748,7 +748,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5( GI_FLOAT32_FIXLEN_t src_v[2][5]; #define COMPUTE_5X5_2(i, dst, src, kernel0, kernel1) \ load_vec<5>(kernel0, filter + i * 5 * 4); \ - load_vec<6>(src, input + i * IW * 4); \ + load_vec<5>(src, input + i * IW * 4); \ compute_vec<5>(dst[0][0], &src[0], kernel0); \ compute_vec<5>(dst[1][0], &src[0], kernel1); // line 0 @@ -813,7 +813,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5( GI_FLOAT32_FIXLEN_t src_v[2][5]; #define COMPUTE_5X5_1(i, dst, src, kernel) \ load_vec<5>(kernel, filter + i * 5 * 4); \ - load_vec<6>(src, input + i * IW * 4); \ + load_vec<5>(src, input + i * IW * 4); \ compute_vec<5>(dst, &src[0], kernel) // line 0 COMPUTE_5X5_1(0, dst_v, src_v[0], kernel[0]); @@ -1148,7 +1148,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5( GI_FLOAT32_FIXLEN_t src_v[2][5]; #define COMPUTE_5X5_1(i, dst, src, kernel) \ load_vec<5>(kernel, filter + i * 5 * 4); \ - load_vec<6>(src, input + i * IW * 4); \ + load_vec<5>(src, input + i * IW * 4); \ compute_vec<5>(dst, &src[0], kernel) // line 0 COMPUTE_5X5_1(0, dst_v, src_v[0], kernel[0]); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h index 2a434158..43e43260 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h +++ b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h @@ -37,6 +37,26 @@ struct ShiftCalHelper { static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} }; +#if 
defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) +//! x86 and rvv GiSimdFmaLane API is slowly, as an alternate, use +//! GiMultiplyAddScalarFloat32 +#define MLA GiMultiplyAddScalarFloat32 +#define cb(step) \ + c[0][step] = GiFloat32Type2FixLenType(MLA( \ + GiFixLenType2GiFloat32Type(c[0][step]), \ + GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ + *(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); \ + c[1][step] = GiFloat32Type2FixLenType(MLA( \ + GiFixLenType2GiFloat32Type(c[1][step]), \ + GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \ + *(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); + +#define cb2(step) \ + c[0][step] = GiFloat32Type2FixLenType(MLA( \ + GiFixLenType2GiFloat32Type(c[0][step]), \ + GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ + *(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); +#else #define cb(step) \ c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ GiFixLenType2GiFloat32Type(c[0][step]), \ @@ -55,6 +75,8 @@ struct ShiftCalHelper { GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ (step * stride + src_idx) % 4)); +#undef MLA +#endif #define SHIFT_CAL_HELPER(ow_remain) \ template < \ @@ -151,23 +173,38 @@ struct KerGiXXs2NchwNchw44FP32(c, bias_ptr, oc_step); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + //! x86 and rvv GiSimdFmaLane API is slowly, as an alternate, use + //! 
GiMultiplyAddScalarFloat32 +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[src_reg_size]; +#else GI_FLOAT32_FIXLEN_t src[src_reg_size]; +#endif GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size]; -#define KERNEL_CB(step) \ - load_helper(src, src_ptr + step * iw, 0); \ - load_helper( \ - weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \ - cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \ - cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \ - cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \ - cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \ - cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); \ - cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight); \ +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) +#define SRC_LOAD(step) \ + load_ptr_helper(src, src_ptr + step * iw, 0) +#else +#define SRC_LOAD(step) \ + load_helper(src, src_ptr + step * iw, 0) +#endif + +#define KERNEL_CB(step) \ + SRC_LOAD(step); \ + load_helper( \ + weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \ + cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \ + cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \ + cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \ + cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \ + cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); \ + cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight); \ cal_helper<6, 6, c_dim, stride, remain_w>(c, src, weight); UNROLL_CALL_RAW(7, KERNEL_CB) #undef KERNEL_CB +#undef SRC_LOAD src_ptr += ld_src_ic; weight_ptr += ld_weight_ic; @@ -200,20 +237,33 @@ struct KerGiXXs2NchwNchw44FP32(c, bias_ptr, oc_step); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[src_reg_size]; +#else GI_FLOAT32_FIXLEN_t src[src_reg_size]; +#endif GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size]; -#define KERNEL_CB(step) \ 
- load_helper(src, src_ptr + step * iw, 0); \ - load_helper( \ - weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \ - cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \ - cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \ - cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \ - cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \ +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) +#define SRC_LOAD(step) \ + load_ptr_helper(src, src_ptr + step * iw, 0); +#else +#define SRC_LOAD(step) \ + load_helper(src, src_ptr + step * iw, 0); +#endif + +#define KERNEL_CB(step) \ + SRC_LOAD(step); \ + load_helper( \ + weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \ + cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \ + cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \ + cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \ + cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \ cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); UNROLL_CALL_RAW(5, KERNEL_CB) #undef KERNEL_CB +#undef SRC_LOAD src_ptr += ld_src_ic; weight_ptr += ld_weight_ic; @@ -246,10 +296,18 @@ struct KerGiXXs2NchwNchw44FP32(c, bias_ptr, oc_step); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[src_reg_size]; +#else GI_FLOAT32_FIXLEN_t src[src_reg_size]; +#endif GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size]; // row 0 +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr, 0); +#else load_helper(src, src_ptr, 0); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); @@ -257,7 +315,11 @@ struct KerGiXXs2NchwNchw44FP32(c, src, weight); // row 1 +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr + iw, 0); +#else load_helper(src, src_ptr + iw, 0); +#endif load_helper( weight, weight_ptr + 1 * 
ld_weight_fw, ld_weight_oc); cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); @@ -265,8 +327,12 @@ struct KerGiXXs2NchwNchw44FP32(c, src, weight); // row 2 +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr + 2 * iw, 0); +#else load_helper( src, src_ptr + 2 * iw, 0); +#endif load_helper( weight, weight_ptr + 2 * ld_weight_fw, ld_weight_oc); cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); @@ -637,17 +703,29 @@ struct KerGiXXs2NchwNchw44FP32(c, bias_ptr, oc_step); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* src[src_reg_size]; +#else GI_FLOAT32_FIXLEN_t src[src_reg_size]; +#endif GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size]; // row 0 +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr, 0); +#else load_helper(src, src_ptr, 0); +#endif load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); // row 1 +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + load_ptr_helper(src, src_ptr + iw, 0); +#else load_helper(src, src_ptr + iw, 0); +#endif load_helper( weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc); cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); @@ -670,7 +748,7 @@ struct ConvDirectFp32NchwNchw44 { constexpr int fh = filter_size; constexpr int fw = filter_size; constexpr int ic_step = 1; -#if MEGDNN_ARMV7 +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) || defined(MEGDNN_ARMV7) constexpr int big_oc_step = 4; #else constexpr int big_oc_step = 8; diff --git a/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h b/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h index f34a221c..e26c601b 100644 --- a/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h +++ b/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h @@ -62,6 +62,13 @@ struct LoadHelper { static 
GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args); }; +template < + int weight_number, int base_offset, int ptr_step, int oc_block, typename T, + typename T2> +struct LoadPtrHelper { + static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset); +}; + #define WEIGHT_CB(step) \ src[step] = GiFloat32Type2FixLenType( \ Func::impl(ptr + base_offset + step * ptr_step, args...)); @@ -96,6 +103,36 @@ LOAD_HELPER(16); #undef LOAD_HELPER #undef WEIGHT_CB +#define WEIGHT_PTR_CB(step) src[step] = ptr + base_offset + step * ptr_step; + +#define LOAD_PTR_HELPER(step) \ + template \ + struct LoadPtrHelper { \ + static GI_FORCEINLINE void impl(T& src, T2 ptr, int) { \ + UNROLL_CALL_RAW(step, WEIGHT_PTR_CB); \ + } \ + } + +LOAD_PTR_HELPER(1); +LOAD_PTR_HELPER(2); +LOAD_PTR_HELPER(3); +LOAD_PTR_HELPER(4); +LOAD_PTR_HELPER(5); +LOAD_PTR_HELPER(6); +LOAD_PTR_HELPER(7); +LOAD_PTR_HELPER(8); +LOAD_PTR_HELPER(9); +LOAD_PTR_HELPER(10); +LOAD_PTR_HELPER(11); +LOAD_PTR_HELPER(12); +LOAD_PTR_HELPER(13); +LOAD_PTR_HELPER(14); +LOAD_PTR_HELPER(15); +LOAD_PTR_HELPER(16); + +#undef LOAD_PTR_HELPER +#undef WEIGHT_PTR_CB + ///////////////////////////c_dim = 1///////////////////////// #define WEIGHT_CB(step) \ src[0][step] = \ @@ -122,6 +159,29 @@ LOAD_HELPER(9); #undef LOAD_HELPER #undef WEIGHT_CB +#define WEIGHT_PTR_CB(step) src[0][step] = ptr + base_offset + step * ptr_step; + +#define LOAD_PTR_HELPER(step) \ + template \ + struct LoadPtrHelper { \ + static GI_FORCEINLINE void impl(T& src, T2 ptr, int) { \ + UNROLL_CALL_RAW(step, WEIGHT_PTR_CB); \ + } \ + } + +LOAD_PTR_HELPER(1); +LOAD_PTR_HELPER(2); +LOAD_PTR_HELPER(3); +LOAD_PTR_HELPER(4); +LOAD_PTR_HELPER(5); +LOAD_PTR_HELPER(6); +LOAD_PTR_HELPER(7); +LOAD_PTR_HELPER(8); +LOAD_PTR_HELPER(9); + +#undef LOAD_PTR_HELPER +#undef WEIGHT_PTR_CB + /////////////////////////c_dim = 2/////////////////////////////// #define WEIGHT_CB(step) \ src[0][step] = \ @@ -149,6 +209,30 @@ LOAD_HELPER(8); #undef LOAD_HELPER #undef 
WEIGHT_CB +#define WEIGHT_PTR_CB(step) \ + src[0][step] = ptr + base_offset + step * ptr_step; \ + src[1][step] = ptr + base_offset + step * ptr_step + oc_offset; + +#define LOAD_PTR_HELPER(step) \ + template \ + struct LoadPtrHelper { \ + static GI_FORCEINLINE void impl(T& src, T2 ptr, int oc_offset) { \ + UNROLL_CALL_RAW(step, WEIGHT_PTR_CB); \ + } \ + } + +LOAD_PTR_HELPER(1); +LOAD_PTR_HELPER(2); +LOAD_PTR_HELPER(3); +LOAD_PTR_HELPER(4); +LOAD_PTR_HELPER(5); +LOAD_PTR_HELPER(6); +LOAD_PTR_HELPER(7); +LOAD_PTR_HELPER(8); + +#undef LOAD_PTR_HELPER +#undef WEIGHT_PTR_CB + template < int weight_number, int base_offset, int ptr_step, int c_dim, typename Func, typename T, typename T2> @@ -157,6 +241,14 @@ GI_FORCEINLINE void load_helper(T& weight, T2 ptr, int oc_offset) { weight, ptr, oc_offset); } +template < + int weight_number, int base_offset, int ptr_step, int c_dim, typename T, + typename T2> +GI_FORCEINLINE void load_ptr_helper(T& weight, T2 ptr, int oc_offset) { + LoadPtrHelper::impl( + weight, ptr, oc_offset); +} + ////////////////////Store_OCX_OW8_Remain///////////////////////// template struct StoreOcxOw8Remain { diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h index 691da937..304e4630 100644 --- a/dnn/src/fallback/general_intrinsic/gi_float.h +++ b/dnn/src/fallback/general_intrinsic/gi_float.h @@ -1110,7 +1110,7 @@ GI_FORCEINLINE GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vmaxq_f32(Vector1, Vector2); -#elif defined(GI_NEON32_INTRINSICS) +#elif defined(GI_SSE2_INTRINSICS) return _mm_max_ps(Vector1, Vector2); #elif defined(GI_RVV_INTRINSICS) return vfmax_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); @@ -1127,7 +1127,7 @@ GI_FORCEINLINE GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vminq_f32(Vector1, Vector2); -#elif defined(GI_NEON32_INTRINSICS) 
+#elif defined(GI_SSE2_INTRINSICS) return _mm_min_ps(Vector1, Vector2); #elif defined(GI_RVV_INTRINSICS) return vfmin_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));