GitOrigin-RevId: f29593be4d
HuaHua404-patch-4
@@ -24,21 +24,27 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} | |||
}; | |||
#define cb2(step, lane, ow_block) \ | |||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[0][lane]), \ | |||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \ | |||
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(c[1][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[1][lane]), \ | |||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); | |||
#define cb(step, lane, ow_block) \ | |||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[0][lane]), \ | |||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
//! GiSimdFmaLane is slow on x86 and RVV; as an alternative, use | |||
//! GiMultiplyAddScalarFloat32. | |||
//! MLA(a, b, c, d) on this branch: a and b are converted with | |||
//! GiFixLenType2GiFloat32Type, while c is treated as a pointer and d as an | |||
//! element offset, so the scalar operand is read from memory as *(c + d). | |||
#define MLA(a, b, c, d) \ | |||
GiMultiplyAddScalarFloat32( \ | |||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d)) | |||
#else | |||
//! MLA(a, b, c, d) on this branch: a, b and c are all converted with | |||
//! GiFixLenType2GiFloat32Type and d is forwarded unchanged as the last | |||
//! argument of GiSimdFmaLane (presumably the lane index -- confirm against the | |||
//! GI header). | |||
#define MLA(a, b, c, d) \ | |||
GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \ | |||
GiFixLenType2GiFloat32Type(c), d) | |||
#endif | |||
#define cb2(step, lane, ow_block) \ | |||
c[0][step] = GiFloat32Type2FixLenType( \ | |||
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \ | |||
c[1][step] = GiFloat32Type2FixLenType( \ | |||
MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane)); | |||
#define cb(step, lane, ow_block) \ | |||
c[0][step] = GiFloat32Type2FixLenType( \ | |||
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); | |||
#define SHIFT_CAL_HELPER(ow_block, remain_w) \ | |||
template < \ | |||
@@ -81,6 +87,7 @@ SHIFT_CAL_HELPER(4, 4); | |||
#undef SHIFT_CAL_HELPER | |||
#undef cb | |||
#undef cb2 | |||
#undef MLA | |||
template < | |||
int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T, | |||
@@ -145,14 +152,23 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
const float* src[ow_block]; | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr + (ow_block)*ic_step; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | |||
#endif | |||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -188,19 +204,32 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
const float* src[ow_block]; | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr + (ow_block)*ic_step; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | |||
#endif | |||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[1] = src_ptr + (ow_block + 1) * ic_step; | |||
#else | |||
src[1] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); | |||
#endif | |||
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -235,33 +264,54 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> { | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
const float* src[ow_block]; | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr + (ow_block)*ic_step; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | |||
#endif | |||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[1] = src_ptr + (ow_block + 1) * ic_step; | |||
#else | |||
src[1] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); | |||
#endif | |||
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[2] = src_ptr + (ow_block + 2) * ic_step; | |||
#else | |||
src[2] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); | |||
#endif | |||
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[3] = src_ptr + (ow_block + 3) * ic_step; | |||
#else | |||
src[3] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); | |||
#endif | |||
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -297,45 +347,74 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> { | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
const float* src[ow_block]; | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr + (ow_block)*ic_step; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | |||
#endif | |||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[1] = src_ptr + (ow_block + 1) * ic_step; | |||
#else | |||
src[1] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); | |||
#endif | |||
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[2] = src_ptr + (ow_block + 2) * ic_step; | |||
#else | |||
src[2] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); | |||
#endif | |||
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[3] = src_ptr + (ow_block + 3) * ic_step; | |||
#else | |||
src[3] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); | |||
#endif | |||
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[4] = src_ptr + (ow_block + 4) * ic_step; | |||
#else | |||
src[4] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 4) * ic_step)); | |||
#endif | |||
load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[5] = src_ptr + (ow_block + 5) * ic_step; | |||
#else | |||
src[5] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 5) * ic_step)); | |||
#endif | |||
load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -24,21 +24,28 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} | |||
}; | |||
#define cb2(step, lane, ow_block) \ | |||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[0][lane]), \ | |||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \ | |||
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(c[1][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[1][lane]), \ | |||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); | |||
#define cb(step, lane, ow_block) \ | |||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[0][lane]), \ | |||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
//! GiSimdFmaLane is slow on x86 and RVV; as an alternative, use | |||
//! GiMultiplyAddScalarFloat32. | |||
//! MLA(a, b, c, d) on this branch: a and b are converted with | |||
//! GiFixLenType2GiFloat32Type, while c is treated as a pointer and d as an | |||
//! element offset, so the scalar operand is read from memory as *(c + d). | |||
#define MLA(a, b, c, d) \ | |||
GiMultiplyAddScalarFloat32( \ | |||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d)) | |||
#else | |||
//! MLA(a, b, c, d) on this branch: a, b and c are all converted with | |||
//! GiFixLenType2GiFloat32Type and d is forwarded unchanged as the last | |||
//! argument of GiSimdFmaLane (presumably the lane index -- confirm against the | |||
//! GI header). | |||
#define MLA(a, b, c, d) \ | |||
GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \ | |||
GiFixLenType2GiFloat32Type(c), d) | |||
#endif | |||
#define cb2(step, lane, ow_block) \ | |||
c[0][step] = GiFloat32Type2FixLenType( \ | |||
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \ | |||
c[1][step] = GiFloat32Type2FixLenType( \ | |||
MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane)); | |||
#define cb(step, lane, ow_block) \ | |||
c[0][step] = GiFloat32Type2FixLenType( \ | |||
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); | |||
#define SHIFT_CAL_HELPER(ow_block, remain_w) \ | |||
template < \ | |||
@@ -81,6 +88,7 @@ SHIFT_CAL_HELPER(4, 4); | |||
#undef SHIFT_CAL_HELPER | |||
#undef cb | |||
#undef cb2 | |||
#undef MLA | |||
template < | |||
int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T, | |||
@@ -146,15 +154,24 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | |||
/////////row 0///////////// | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
const float* src[ow_block]; | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
/////////row 0///////////// | |||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | |||
#endif | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -162,12 +179,20 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||
src_ptr_odd += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
/////////row 1///////////// | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | |||
#endif | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -203,21 +228,34 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | |||
/////////row 0///////////// | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
const float* src[ow_block]; | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
/////////row 0///////////// | |||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr + ow_block * simd_len; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | |||
#endif | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | |||
#endif | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -225,17 +263,29 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
src_ptr_odd += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
/////////row 1///////////// | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr + ow_block * simd_len; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | |||
#endif | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | |||
#endif | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -243,18 +293,30 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
src_ptr_odd += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
//////////row 2///////////// | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr + ow_block * simd_len; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | |||
#endif | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | |||
#endif | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -292,30 +354,51 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> { | |||
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | |||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | |||
// even element | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
const float* src[ow_block]; | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
// even element | |||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr + ow_block * simd_len; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | |||
#endif | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[1] = src_ptr + (ow_block + 1) * simd_len; | |||
#else | |||
src[1] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); | |||
#endif | |||
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
// odd element | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | |||
#endif | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr_odd + ow_block * simd_len; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr_odd + ow_block * simd_len)); | |||
#endif | |||
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -360,40 +443,69 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> { | |||
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | |||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | |||
// even element | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
const float* src[ow_block]; | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||
#else | |||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | |||
#endif | |||
// even element | |||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr + ow_block * simd_len; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | |||
#endif | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[1] = src_ptr + (ow_block + 1) * simd_len; | |||
#else | |||
src[1] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); | |||
#endif | |||
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[2] = src_ptr + (ow_block + 2) * simd_len; | |||
#else | |||
src[2] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr + (ow_block + 2) * simd_len)); | |||
#endif | |||
load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
// odd element | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||
#else | |||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | |||
#endif | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[0] = src_ptr_odd + ow_block * simd_len; | |||
#else | |||
src[0] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr_odd + ow_block * simd_len)); | |||
#endif | |||
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
src[1] = src_ptr_odd + (ow_block + 1) * simd_len; | |||
#else | |||
src[1] = GiFloat32Type2FixLenType( | |||
GiLoadFloat32(src_ptr_odd + (ow_block + 1) * simd_len)); | |||
#endif | |||
load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
@@ -40,44 +40,29 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> { | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
//! GiSimdFmaLane is slow on x86 and RVV; as an alternative, use | |||
//! GiMultiplyAddScalarFloat32. | |||
#define MLA GiMultiplyAddScalarFloat32 | |||
#define cb(step) \ | |||
c[0][step] = GiFloat32Type2FixLenType(MLA( \ | |||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ | |||
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); \ | |||
c[1][step] = GiFloat32Type2FixLenType(MLA( \ | |||
GiFixLenType2GiFloat32Type(c[1][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \ | |||
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); | |||
#define cb2(step) \ | |||
c[0][step] = GiFloat32Type2FixLenType(MLA( \ | |||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ | |||
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); | |||
#define MLA(a, b, c, d) \ | |||
GiMultiplyAddScalarFloat32( \ | |||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d)) | |||
#else | |||
#define cb(step) \ | |||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ | |||
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ | |||
(step * stride + src_idx) % 4)); \ | |||
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(c[1][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \ | |||
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ | |||
(step * stride + src_idx) % 4)); | |||
#define cb2(step) \ | |||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ | |||
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ | |||
(step * stride + src_idx) % 4)); | |||
#undef MLA | |||
#define MLA(a, b, c, d) \ | |||
GiSimdFmaLane( \ | |||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \ | |||
GiFixLenType2GiFloat32Type(c), d) | |||
#endif | |||
#define cb(step) \ | |||
c[0][step] = GiFloat32Type2FixLenType( \ | |||
MLA(c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \ | |||
(step * stride + src_idx) % 4)); \ | |||
c[1][step] = GiFloat32Type2FixLenType( \ | |||
MLA(c[1][step], weight[1][weight_idx], src[(step * stride + src_idx) / 4], \ | |||
(step * stride + src_idx) % 4)); | |||
#define cb2(step) \ | |||
c[0][step] = GiFloat32Type2FixLenType( \ | |||
MLA(c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \ | |||
(step * stride + src_idx) % 4)); | |||
#define SHIFT_CAL_HELPER(ow_remain) \ | |||
template < \ | |||
int src_idx, int weight_idx, int stride, typename T, typename T2, \ | |||
@@ -108,6 +93,7 @@ SHIFT_CAL_HELPER(8) | |||
#undef SHIFT_CAL_HELPER | |||
#undef cb | |||
#undef cb2 | |||
#undef MLA | |||
template < | |||
int src_idx, int weight_idx, int c_dim, int stride, int remain_w, typename T, | |||
@@ -17,6 +17,30 @@ using namespace conv_stride1; | |||
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | |||
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||
#if defined(GI_RVV_INTRINSICS) | |||
#define PREFER_VF | |||
#endif | |||
#if defined(PREFER_VF) | |||
//! PREFER_VF path: c is treated as a pointer and d as an element offset, so | |||
//! the scalar operand handed to GiMultiplyAddScalarFloat32 is read as *(c + d). | |||
#define MLA(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) | |||
namespace { | |||
//! Pointer-based helper used on the PREFER_VF path where the other path calls | |||
//! GiExtqFloat32 on loaded vectors. | |||
//! Fills ret with t_count = GI_SIMD_LEN_BYTE / sizeof(float) floats: | |||
//! first a[n] .. a[t_count - 1], then b[0] .. b[n - 1]. | |||
//! \param a   source of the leading (t_count - n) output elements | |||
//! \param b   source of the trailing n output elements | |||
//! \param n   number of elements skipped at the start of a and taken from b | |||
//! \param ret destination buffer; t_count floats are written | |||
//! NOTE(review): n is not validated here; presumably callers guarantee | |||
//! 0 <= n <= t_count -- confirm, since a larger n would index ret negatively. | |||
GI_FORCEINLINE void ext_float32_ptr( | |||
const float* a, const float* b, const int n, float* ret) { | |||
int t_count = GI_SIMD_LEN_BYTE / sizeof(float); | |||
int a_count = t_count - n; | |||
// Leading part: elements of a starting at index n. | |||
for (int i = 0; i < a_count; i++) { | |||
ret[i] = a[i + n]; | |||
} | |||
// Trailing part: the first n elements of b, appended after the a part. | |||
for (int i = 0; i < n; i++) { | |||
ret[i + a_count] = b[i]; | |||
} | |||
} | |||
}; // namespace | |||
#else | |||
//! Default path: forward all four arguments unchanged to GiSimdFmaLane. | |||
#define MLA(a, b, c, d) GiSimdFmaLane(a, b, c, d) | |||
#endif | |||
void conv_stride1::do_conv_2x2_stride1( | |||
const float* src, const float* filter, float* dst, size_t IH, size_t IW, | |||
size_t OH, size_t OW, size_t IC) { | |||
@@ -143,10 +167,18 @@ void conv_stride1::do_conv_3x3_stride1( | |||
const float* k1 = filter + 3; | |||
const float* k2 = filter + 5; | |||
#if defined(PREFER_VF) | |||
const float* _k0123 = k0; | |||
const float* _k3456 = k1; | |||
const float* _k5678 = k2; | |||
float _k6789[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||
ext_float32_ptr(_k5678, _k5678, 1, _k6789); | |||
#else | |||
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | |||
GI_FLOAT32_t _k3456 = GiLoadFloat32(k1); | |||
GI_FLOAT32_t _k5678 = GiLoadFloat32(k2); | |||
GI_FLOAT32_t _k6789 = GiExtqFloat32(_k5678, _k5678, 1); | |||
#endif | |||
size_t h = 0; | |||
for (; h + 1 < OH; h += 2) { | |||
@@ -178,25 +210,25 @@ void conv_stride1::do_conv_3x3_stride1( | |||
GI_FLOAT32_t _r31 = GiExtqFloat32(_r30, _r30n, 1); | |||
GI_FLOAT32_t _r32 = GiExtqFloat32(_r30, _r30n, 2); | |||
_sum1 = GiSimdFmaLane(_sum1, _r00, _k0123, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r01, _k0123, 1); | |||
_sum1 = GiSimdFmaLane(_sum1, _r02, _k0123, 2); | |||
_sum2 = GiSimdFmaLane(_sum2, _r10, _k3456, 0); | |||
_sum1 = GiSimdFmaLane(_sum1, _r11, _k3456, 1); | |||
_sum2 = GiSimdFmaLane(_sum2, _r12, _k3456, 2); | |||
_sum1 = GiSimdFmaLane(_sum1, _r20, _k6789, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r21, _k6789, 1); | |||
_sum1 = GiSimdFmaLane(_sum1, _r22, _k6789, 2); | |||
_sum3 = GiSimdFmaLane(_sum3, _r10, _k0123, 0); | |||
_sum4 = GiSimdFmaLane(_sum4, _r11, _k0123, 1); | |||
_sum3 = GiSimdFmaLane(_sum3, _r12, _k0123, 2); | |||
_sum4 = GiSimdFmaLane(_sum4, _r20, _k3456, 0); | |||
_sum3 = GiSimdFmaLane(_sum3, _r21, _k3456, 1); | |||
_sum4 = GiSimdFmaLane(_sum4, _r22, _k3456, 2); | |||
_sum3 = GiSimdFmaLane(_sum3, _r30, _k6789, 0); | |||
_sum4 = GiSimdFmaLane(_sum4, _r31, _k6789, 1); | |||
_sum3 = GiSimdFmaLane(_sum3, _r32, _k6789, 2); | |||
_sum1 = MLA(_sum1, _r00, _k0123, 0); | |||
_sum2 = MLA(_sum2, _r01, _k0123, 1); | |||
_sum1 = MLA(_sum1, _r02, _k0123, 2); | |||
_sum2 = MLA(_sum2, _r10, _k3456, 0); | |||
_sum1 = MLA(_sum1, _r11, _k3456, 1); | |||
_sum2 = MLA(_sum2, _r12, _k3456, 2); | |||
_sum1 = MLA(_sum1, _r20, _k6789, 0); | |||
_sum2 = MLA(_sum2, _r21, _k6789, 1); | |||
_sum1 = MLA(_sum1, _r22, _k6789, 2); | |||
_sum3 = MLA(_sum3, _r10, _k0123, 0); | |||
_sum4 = MLA(_sum4, _r11, _k0123, 1); | |||
_sum3 = MLA(_sum3, _r12, _k0123, 2); | |||
_sum4 = MLA(_sum4, _r20, _k3456, 0); | |||
_sum3 = MLA(_sum3, _r21, _k3456, 1); | |||
_sum4 = MLA(_sum4, _r22, _k3456, 2); | |||
_sum3 = MLA(_sum3, _r30, _k6789, 0); | |||
_sum4 = MLA(_sum4, _r31, _k6789, 1); | |||
_sum3 = MLA(_sum3, _r32, _k6789, 2); | |||
_sum1 = GiAddFloat32(_sum1, _sum2); | |||
_sum3 = GiAddFloat32(_sum3, _sum4); | |||
@@ -243,15 +275,15 @@ void conv_stride1::do_conv_3x3_stride1( | |||
GI_FLOAT32_t _r21 = GiExtqFloat32(_r20, _r20n, 1); | |||
GI_FLOAT32_t _r22 = GiExtqFloat32(_r20, _r20n, 2); | |||
_sum1 = GiSimdFmaLane(_sum1, _r00, _k0123, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r01, _k0123, 1); | |||
_sum1 = GiSimdFmaLane(_sum1, _r02, _k0123, 2); | |||
_sum2 = GiSimdFmaLane(_sum2, _r10, _k3456, 0); | |||
_sum1 = GiSimdFmaLane(_sum1, _r11, _k3456, 1); | |||
_sum2 = GiSimdFmaLane(_sum2, _r12, _k3456, 2); | |||
_sum1 = GiSimdFmaLane(_sum1, _r20, _k6789, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r21, _k6789, 1); | |||
_sum1 = GiSimdFmaLane(_sum1, _r22, _k6789, 2); | |||
_sum1 = MLA(_sum1, _r00, _k0123, 0); | |||
_sum2 = MLA(_sum2, _r01, _k0123, 1); | |||
_sum1 = MLA(_sum1, _r02, _k0123, 2); | |||
_sum2 = MLA(_sum2, _r10, _k3456, 0); | |||
_sum1 = MLA(_sum1, _r11, _k3456, 1); | |||
_sum2 = MLA(_sum2, _r12, _k3456, 2); | |||
_sum1 = MLA(_sum1, _r20, _k6789, 0); | |||
_sum2 = MLA(_sum2, _r21, _k6789, 1); | |||
_sum1 = MLA(_sum1, _r22, _k6789, 2); | |||
_sum1 = GiAddFloat32(_sum1, _sum2); | |||
@@ -288,6 +320,15 @@ void conv_stride1::do_conv_5x5_stride1( | |||
const float* r4 = src_ptr + IW * 4; | |||
const float* r5 = src_ptr + IW * 5; | |||
#if defined(PREFER_VF) | |||
const float* _k0123 = filter; | |||
const float* _k4567 = filter + 4; | |||
const float* _k891011 = filter + 8; | |||
const float* _k12131415 = filter + 12; | |||
const float* _k16171819 = filter + 16; | |||
const float* _k20212223 = filter + 20; | |||
const float* _k24242424 = filter + 24; | |||
#else | |||
GI_FLOAT32_t _k0123 = GiLoadFloat32(filter); | |||
GI_FLOAT32_t _k4567 = GiLoadFloat32(filter + 4); | |||
GI_FLOAT32_t _k891011 = GiLoadFloat32(filter + 8); | |||
@@ -295,6 +336,7 @@ void conv_stride1::do_conv_5x5_stride1( | |||
GI_FLOAT32_t _k16171819 = GiLoadFloat32(filter + 16); | |||
GI_FLOAT32_t _k20212223 = GiLoadFloat32(filter + 20); | |||
GI_FLOAT32_t _k24242424 = GiBroadcastFloat32(filter[24]); | |||
#endif | |||
size_t h = 0; | |||
for (; h + 1 < OH; h += 2) { | |||
@@ -340,65 +382,65 @@ void conv_stride1::do_conv_5x5_stride1( | |||
GI_FLOAT32_t _r52 = GiExtqFloat32(_r50, _r54, 2); | |||
GI_FLOAT32_t _r53 = GiExtqFloat32(_r50, _r54, 3); | |||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||
_sum = GiSimdFmaLane(_sum, _r10, _k4567, 1); | |||
_sum = GiSimdFmaLane(_sum, _r11, _k4567, 2); | |||
_sum = GiSimdFmaLane(_sum, _r12, _k4567, 3); | |||
_sum = GiSimdFmaLane(_sum, _r13, _k891011, 0); | |||
_sum = GiSimdFmaLane(_sum, _r14, _k891011, 1); | |||
_sum = GiSimdFmaLane(_sum, _r20, _k891011, 2); | |||
_sum = GiSimdFmaLane(_sum, _r21, _k891011, 3); | |||
_sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0); | |||
_sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1); | |||
_sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2); | |||
_sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3); | |||
_sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0); | |||
_sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1); | |||
_sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2); | |||
_sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3); | |||
_sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0); | |||
_sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1); | |||
_sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2); | |||
_sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3); | |||
_sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r10, _k0123, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r11, _k0123, 1); | |||
_sum2 = GiSimdFmaLane(_sum2, _r12, _k0123, 2); | |||
_sum2 = GiSimdFmaLane(_sum2, _r13, _k0123, 3); | |||
_sum2 = GiSimdFmaLane(_sum2, _r14, _k4567, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r20, _k4567, 1); | |||
_sum2 = GiSimdFmaLane(_sum2, _r21, _k4567, 2); | |||
_sum2 = GiSimdFmaLane(_sum2, _r22, _k4567, 3); | |||
_sum2 = GiSimdFmaLane(_sum2, _r23, _k891011, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r24, _k891011, 1); | |||
_sum2 = GiSimdFmaLane(_sum2, _r30, _k891011, 2); | |||
_sum2 = GiSimdFmaLane(_sum2, _r31, _k891011, 3); | |||
_sum2 = GiSimdFmaLane(_sum2, _r32, _k12131415, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r33, _k12131415, 1); | |||
_sum2 = GiSimdFmaLane(_sum2, _r34, _k12131415, 2); | |||
_sum2 = GiSimdFmaLane(_sum2, _r40, _k12131415, 3); | |||
_sum2 = GiSimdFmaLane(_sum2, _r41, _k16171819, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r42, _k16171819, 1); | |||
_sum2 = GiSimdFmaLane(_sum2, _r43, _k16171819, 2); | |||
_sum2 = GiSimdFmaLane(_sum2, _r44, _k16171819, 3); | |||
_sum2 = GiSimdFmaLane(_sum2, _r50, _k20212223, 0); | |||
_sum2 = GiSimdFmaLane(_sum2, _r51, _k20212223, 1); | |||
_sum2 = GiSimdFmaLane(_sum2, _r52, _k20212223, 2); | |||
_sum2 = GiSimdFmaLane(_sum2, _r53, _k20212223, 3); | |||
_sum2 = GiSimdFmaLane(_sum2, _r54, _k24242424, 0); | |||
_sum = MLA(_sum, _r00, _k0123, 0); | |||
_sum = MLA(_sum, _r01, _k0123, 1); | |||
_sum = MLA(_sum, _r02, _k0123, 2); | |||
_sum = MLA(_sum, _r03, _k0123, 3); | |||
_sum = MLA(_sum, _r04, _k4567, 0); | |||
_sum = MLA(_sum, _r10, _k4567, 1); | |||
_sum = MLA(_sum, _r11, _k4567, 2); | |||
_sum = MLA(_sum, _r12, _k4567, 3); | |||
_sum = MLA(_sum, _r13, _k891011, 0); | |||
_sum = MLA(_sum, _r14, _k891011, 1); | |||
_sum = MLA(_sum, _r20, _k891011, 2); | |||
_sum = MLA(_sum, _r21, _k891011, 3); | |||
_sum = MLA(_sum, _r22, _k12131415, 0); | |||
_sum = MLA(_sum, _r23, _k12131415, 1); | |||
_sum = MLA(_sum, _r24, _k12131415, 2); | |||
_sum = MLA(_sum, _r30, _k12131415, 3); | |||
_sum = MLA(_sum, _r31, _k16171819, 0); | |||
_sum = MLA(_sum, _r32, _k16171819, 1); | |||
_sum = MLA(_sum, _r33, _k16171819, 2); | |||
_sum = MLA(_sum, _r34, _k16171819, 3); | |||
_sum = MLA(_sum, _r40, _k20212223, 0); | |||
_sum = MLA(_sum, _r41, _k20212223, 1); | |||
_sum = MLA(_sum, _r42, _k20212223, 2); | |||
_sum = MLA(_sum, _r43, _k20212223, 3); | |||
_sum = MLA(_sum, _r44, _k24242424, 0); | |||
_sum2 = MLA(_sum2, _r10, _k0123, 0); | |||
_sum2 = MLA(_sum2, _r11, _k0123, 1); | |||
_sum2 = MLA(_sum2, _r12, _k0123, 2); | |||
_sum2 = MLA(_sum2, _r13, _k0123, 3); | |||
_sum2 = MLA(_sum2, _r14, _k4567, 0); | |||
_sum2 = MLA(_sum2, _r20, _k4567, 1); | |||
_sum2 = MLA(_sum2, _r21, _k4567, 2); | |||
_sum2 = MLA(_sum2, _r22, _k4567, 3); | |||
_sum2 = MLA(_sum2, _r23, _k891011, 0); | |||
_sum2 = MLA(_sum2, _r24, _k891011, 1); | |||
_sum2 = MLA(_sum2, _r30, _k891011, 2); | |||
_sum2 = MLA(_sum2, _r31, _k891011, 3); | |||
_sum2 = MLA(_sum2, _r32, _k12131415, 0); | |||
_sum2 = MLA(_sum2, _r33, _k12131415, 1); | |||
_sum2 = MLA(_sum2, _r34, _k12131415, 2); | |||
_sum2 = MLA(_sum2, _r40, _k12131415, 3); | |||
_sum2 = MLA(_sum2, _r41, _k16171819, 0); | |||
_sum2 = MLA(_sum2, _r42, _k16171819, 1); | |||
_sum2 = MLA(_sum2, _r43, _k16171819, 2); | |||
_sum2 = MLA(_sum2, _r44, _k16171819, 3); | |||
_sum2 = MLA(_sum2, _r50, _k20212223, 0); | |||
_sum2 = MLA(_sum2, _r51, _k20212223, 1); | |||
_sum2 = MLA(_sum2, _r52, _k20212223, 2); | |||
_sum2 = MLA(_sum2, _r53, _k20212223, 3); | |||
_sum2 = MLA(_sum2, _r54, _k24242424, 0); | |||
GiStoreFloat32(outptr, _sum); | |||
GiStoreFloat32(outptr2, _sum2); | |||
@@ -460,35 +502,35 @@ void conv_stride1::do_conv_5x5_stride1( | |||
GI_FLOAT32_t _r42 = GiExtqFloat32(_r40, _r44, 2); | |||
GI_FLOAT32_t _r43 = GiExtqFloat32(_r40, _r44, 3); | |||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||
_sum = GiSimdFmaLane(_sum, _r10, _k4567, 1); | |||
_sum = GiSimdFmaLane(_sum, _r11, _k4567, 2); | |||
_sum = GiSimdFmaLane(_sum, _r12, _k4567, 3); | |||
_sum = GiSimdFmaLane(_sum, _r13, _k891011, 0); | |||
_sum = GiSimdFmaLane(_sum, _r14, _k891011, 1); | |||
_sum = GiSimdFmaLane(_sum, _r20, _k891011, 2); | |||
_sum = GiSimdFmaLane(_sum, _r21, _k891011, 3); | |||
_sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0); | |||
_sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1); | |||
_sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2); | |||
_sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3); | |||
_sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0); | |||
_sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1); | |||
_sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2); | |||
_sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3); | |||
_sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0); | |||
_sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1); | |||
_sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2); | |||
_sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3); | |||
_sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0); | |||
_sum = MLA(_sum, _r00, _k0123, 0); | |||
_sum = MLA(_sum, _r01, _k0123, 1); | |||
_sum = MLA(_sum, _r02, _k0123, 2); | |||
_sum = MLA(_sum, _r03, _k0123, 3); | |||
_sum = MLA(_sum, _r04, _k4567, 0); | |||
_sum = MLA(_sum, _r10, _k4567, 1); | |||
_sum = MLA(_sum, _r11, _k4567, 2); | |||
_sum = MLA(_sum, _r12, _k4567, 3); | |||
_sum = MLA(_sum, _r13, _k891011, 0); | |||
_sum = MLA(_sum, _r14, _k891011, 1); | |||
_sum = MLA(_sum, _r20, _k891011, 2); | |||
_sum = MLA(_sum, _r21, _k891011, 3); | |||
_sum = MLA(_sum, _r22, _k12131415, 0); | |||
_sum = MLA(_sum, _r23, _k12131415, 1); | |||
_sum = MLA(_sum, _r24, _k12131415, 2); | |||
_sum = MLA(_sum, _r30, _k12131415, 3); | |||
_sum = MLA(_sum, _r31, _k16171819, 0); | |||
_sum = MLA(_sum, _r32, _k16171819, 1); | |||
_sum = MLA(_sum, _r33, _k16171819, 2); | |||
_sum = MLA(_sum, _r34, _k16171819, 3); | |||
_sum = MLA(_sum, _r40, _k20212223, 0); | |||
_sum = MLA(_sum, _r41, _k20212223, 1); | |||
_sum = MLA(_sum, _r42, _k20212223, 2); | |||
_sum = MLA(_sum, _r43, _k20212223, 3); | |||
_sum = MLA(_sum, _r44, _k24242424, 0); | |||
GiStoreFloat32(outptr, _sum); | |||
@@ -542,8 +584,13 @@ void conv_stride1::do_conv_7x7_stride1( | |||
rep(i, width) { | |||
GI_FLOAT32_t _sum = GiLoadFloat32(outptr); | |||
#if defined(PREFER_VF) | |||
const float* _k0123 = k0; | |||
const float* _k4567 = k0 + 4; | |||
#else | |||
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | |||
GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4); | |||
#endif | |||
GI_FLOAT32_t _r00 = GiLoadFloat32(r0); // 0 1 2 3 | |||
GI_FLOAT32_t _r04 = GiLoadFloat32(r0 + 4); // 4 5 6 7 | |||
@@ -554,16 +601,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||
GI_FLOAT32_t _r05 = GiExtqFloat32(_r04, _r00n, 1); // 5 6 7 8 | |||
GI_FLOAT32_t _r06 = GiExtqFloat32(_r04, _r00n, 2); // 6 7 8 9 | |||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||
_sum = GiSimdFmaLane(_sum, _r05, _k4567, 1); | |||
_sum = GiSimdFmaLane(_sum, _r06, _k4567, 2); | |||
_sum = MLA(_sum, _r00, _k0123, 0); | |||
_sum = MLA(_sum, _r01, _k0123, 1); | |||
_sum = MLA(_sum, _r02, _k0123, 2); | |||
_sum = MLA(_sum, _r03, _k0123, 3); | |||
_sum = MLA(_sum, _r04, _k4567, 0); | |||
_sum = MLA(_sum, _r05, _k4567, 1); | |||
_sum = MLA(_sum, _r06, _k4567, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k78910 = k1; | |||
const float* _k11121314 = k1 + 4; | |||
#else | |||
GI_FLOAT32_t _k78910 = GiLoadFloat32(k1); | |||
GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4); | |||
#endif | |||
GI_FLOAT32_t _r10 = GiLoadFloat32(r1); | |||
GI_FLOAT32_t _r14 = GiLoadFloat32(r1 + 4); | |||
@@ -574,16 +626,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||
GI_FLOAT32_t _r15 = GiExtqFloat32(_r14, _r10n, 1); | |||
GI_FLOAT32_t _r16 = GiExtqFloat32(_r14, _r10n, 2); | |||
_sum = GiSimdFmaLane(_sum, _r10, _k78910, 0); | |||
_sum = GiSimdFmaLane(_sum, _r11, _k78910, 1); | |||
_sum = GiSimdFmaLane(_sum, _r12, _k78910, 2); | |||
_sum = GiSimdFmaLane(_sum, _r13, _k78910, 3); | |||
_sum = GiSimdFmaLane(_sum, _r14, _k11121314, 0); | |||
_sum = GiSimdFmaLane(_sum, _r15, _k11121314, 1); | |||
_sum = GiSimdFmaLane(_sum, _r16, _k11121314, 2); | |||
_sum = MLA(_sum, _r10, _k78910, 0); | |||
_sum = MLA(_sum, _r11, _k78910, 1); | |||
_sum = MLA(_sum, _r12, _k78910, 2); | |||
_sum = MLA(_sum, _r13, _k78910, 3); | |||
_sum = MLA(_sum, _r14, _k11121314, 0); | |||
_sum = MLA(_sum, _r15, _k11121314, 1); | |||
_sum = MLA(_sum, _r16, _k11121314, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k14151617 = k2; | |||
const float* _k18192021 = k2 + 4; | |||
#else | |||
GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2); | |||
GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4); | |||
#endif | |||
GI_FLOAT32_t _r20 = GiLoadFloat32(r2); | |||
GI_FLOAT32_t _r24 = GiLoadFloat32(r2 + 4); | |||
@@ -594,16 +651,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||
GI_FLOAT32_t _r25 = GiExtqFloat32(_r24, _r20n, 1); | |||
GI_FLOAT32_t _r26 = GiExtqFloat32(_r24, _r20n, 2); | |||
_sum = GiSimdFmaLane(_sum, _r20, _k14151617, 0); | |||
_sum = GiSimdFmaLane(_sum, _r21, _k14151617, 1); | |||
_sum = GiSimdFmaLane(_sum, _r22, _k14151617, 2); | |||
_sum = GiSimdFmaLane(_sum, _r23, _k14151617, 3); | |||
_sum = GiSimdFmaLane(_sum, _r24, _k18192021, 0); | |||
_sum = GiSimdFmaLane(_sum, _r25, _k18192021, 1); | |||
_sum = GiSimdFmaLane(_sum, _r26, _k18192021, 2); | |||
_sum = MLA(_sum, _r20, _k14151617, 0); | |||
_sum = MLA(_sum, _r21, _k14151617, 1); | |||
_sum = MLA(_sum, _r22, _k14151617, 2); | |||
_sum = MLA(_sum, _r23, _k14151617, 3); | |||
_sum = MLA(_sum, _r24, _k18192021, 0); | |||
_sum = MLA(_sum, _r25, _k18192021, 1); | |||
_sum = MLA(_sum, _r26, _k18192021, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k21222324 = k3; | |||
const float* _k25262728 = k3 + 4; | |||
#else | |||
GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3); | |||
GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4); | |||
#endif | |||
GI_FLOAT32_t _r30 = GiLoadFloat32(r3); | |||
GI_FLOAT32_t _r34 = GiLoadFloat32(r3 + 4); | |||
@@ -614,16 +676,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||
GI_FLOAT32_t _r35 = GiExtqFloat32(_r34, _r30n, 1); | |||
GI_FLOAT32_t _r36 = GiExtqFloat32(_r34, _r30n, 2); | |||
_sum = GiSimdFmaLane(_sum, _r30, _k21222324, 0); | |||
_sum = GiSimdFmaLane(_sum, _r31, _k21222324, 1); | |||
_sum = GiSimdFmaLane(_sum, _r32, _k21222324, 2); | |||
_sum = GiSimdFmaLane(_sum, _r33, _k21222324, 3); | |||
_sum = GiSimdFmaLane(_sum, _r34, _k25262728, 0); | |||
_sum = GiSimdFmaLane(_sum, _r35, _k25262728, 1); | |||
_sum = GiSimdFmaLane(_sum, _r36, _k25262728, 2); | |||
_sum = MLA(_sum, _r30, _k21222324, 0); | |||
_sum = MLA(_sum, _r31, _k21222324, 1); | |||
_sum = MLA(_sum, _r32, _k21222324, 2); | |||
_sum = MLA(_sum, _r33, _k21222324, 3); | |||
_sum = MLA(_sum, _r34, _k25262728, 0); | |||
_sum = MLA(_sum, _r35, _k25262728, 1); | |||
_sum = MLA(_sum, _r36, _k25262728, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k28293031 = k4; | |||
const float* _k32333435 = k4 + 4; | |||
#else | |||
GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4); | |||
GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4); | |||
#endif | |||
GI_FLOAT32_t _r40 = GiLoadFloat32(r4); | |||
GI_FLOAT32_t _r44 = GiLoadFloat32(r4 + 4); | |||
@@ -634,16 +701,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||
GI_FLOAT32_t _r45 = GiExtqFloat32(_r44, _r40n, 1); | |||
GI_FLOAT32_t _r46 = GiExtqFloat32(_r44, _r40n, 2); | |||
_sum = GiSimdFmaLane(_sum, _r40, _k28293031, 0); | |||
_sum = GiSimdFmaLane(_sum, _r41, _k28293031, 1); | |||
_sum = GiSimdFmaLane(_sum, _r42, _k28293031, 2); | |||
_sum = GiSimdFmaLane(_sum, _r43, _k28293031, 3); | |||
_sum = GiSimdFmaLane(_sum, _r44, _k32333435, 0); | |||
_sum = GiSimdFmaLane(_sum, _r45, _k32333435, 1); | |||
_sum = GiSimdFmaLane(_sum, _r46, _k32333435, 2); | |||
_sum = MLA(_sum, _r40, _k28293031, 0); | |||
_sum = MLA(_sum, _r41, _k28293031, 1); | |||
_sum = MLA(_sum, _r42, _k28293031, 2); | |||
_sum = MLA(_sum, _r43, _k28293031, 3); | |||
_sum = MLA(_sum, _r44, _k32333435, 0); | |||
_sum = MLA(_sum, _r45, _k32333435, 1); | |||
_sum = MLA(_sum, _r46, _k32333435, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k35363738 = k5; | |||
const float* _k39404142 = k5 + 4; | |||
#else | |||
GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5); | |||
GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4); | |||
#endif | |||
GI_FLOAT32_t _r50 = GiLoadFloat32(r5); | |||
GI_FLOAT32_t _r54 = GiLoadFloat32(r5 + 4); | |||
@@ -654,17 +726,24 @@ void conv_stride1::do_conv_7x7_stride1( | |||
GI_FLOAT32_t _r55 = GiExtqFloat32(_r54, _r50n, 1); | |||
GI_FLOAT32_t _r56 = GiExtqFloat32(_r54, _r50n, 2); | |||
_sum = GiSimdFmaLane(_sum, _r50, _k35363738, 0); | |||
_sum = GiSimdFmaLane(_sum, _r51, _k35363738, 1); | |||
_sum = GiSimdFmaLane(_sum, _r52, _k35363738, 2); | |||
_sum = GiSimdFmaLane(_sum, _r53, _k35363738, 3); | |||
_sum = GiSimdFmaLane(_sum, _r54, _k39404142, 0); | |||
_sum = GiSimdFmaLane(_sum, _r55, _k39404142, 1); | |||
_sum = GiSimdFmaLane(_sum, _r56, _k39404142, 2); | |||
_sum = MLA(_sum, _r50, _k35363738, 0); | |||
_sum = MLA(_sum, _r51, _k35363738, 1); | |||
_sum = MLA(_sum, _r52, _k35363738, 2); | |||
_sum = MLA(_sum, _r53, _k35363738, 3); | |||
_sum = MLA(_sum, _r54, _k39404142, 0); | |||
_sum = MLA(_sum, _r55, _k39404142, 1); | |||
_sum = MLA(_sum, _r56, _k39404142, 2); | |||
#if defined(PREFER_VF)
//! Scalar-multiply path: MLA reads each tap through a float pointer, so the
//! taps stay in memory instead of being loaded into vector registers.
const float* _k42434445 = k6;
//! Only lanes 0..2 of _k46474849 are consumed by the MLA calls that follow,
//! matching the SIMD branch below, which loads exactly three floats starting
//! at k6 + 4 (low half + lane 2). Copy (lane count - 1) whole floats. The
//! previous size expression lacked parentheses and evaluated to
//! GI_SIMD_LEN_BYTE - 1 bytes, i.e. a partial extra float read from beyond
//! those three taps.
float _k46474849[GI_SIMD_LEN_BYTE / sizeof(float)];
memcpy(_k46474849, k6 + 4,
        sizeof(float) * (GI_SIMD_LEN_BYTE / sizeof(float) - 1));
#else
GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6);
//! Build the tail vector from k6[4], k6[5] (low half) and k6[6] (lane 2) so
//! that no element past k6 + 6 is touched.
GI_FLOAT32_t _k46474849 =
        GiLd1qLaneFloat32(k6 + 4 + 2, GiLoadFloat32LowHalf(k6 + 4), 2);
#endif
GI_FLOAT32_t _r60 = GiLoadFloat32(r6); | |||
GI_FLOAT32_t _r64 = GiLoadFloat32(r6 + 4); | |||
@@ -675,13 +754,13 @@ void conv_stride1::do_conv_7x7_stride1( | |||
GI_FLOAT32_t _r65 = GiExtqFloat32(_r64, _r60n, 1); | |||
GI_FLOAT32_t _r66 = GiExtqFloat32(_r64, _r60n, 2); | |||
_sum = GiSimdFmaLane(_sum, _r60, _k42434445, 0); | |||
_sum = GiSimdFmaLane(_sum, _r61, _k42434445, 1); | |||
_sum = GiSimdFmaLane(_sum, _r62, _k42434445, 2); | |||
_sum = GiSimdFmaLane(_sum, _r63, _k42434445, 3); | |||
_sum = GiSimdFmaLane(_sum, _r64, _k46474849, 0); | |||
_sum = GiSimdFmaLane(_sum, _r65, _k46474849, 1); | |||
_sum = GiSimdFmaLane(_sum, _r66, _k46474849, 2); | |||
_sum = MLA(_sum, _r60, _k42434445, 0); | |||
_sum = MLA(_sum, _r61, _k42434445, 1); | |||
_sum = MLA(_sum, _r62, _k42434445, 2); | |||
_sum = MLA(_sum, _r63, _k42434445, 3); | |||
_sum = MLA(_sum, _r64, _k46474849, 0); | |||
_sum = MLA(_sum, _r65, _k46474849, 1); | |||
_sum = MLA(_sum, _r66, _k46474849, 2); | |||
GiStoreFloat32(outptr, _sum); | |||
@@ -15,6 +15,30 @@ using namespace conv_stride2; | |||
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | |||
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||
#if defined(GI_RVV_INTRINSICS)
//! RVV builds select the vector-by-scalar multiply-add path below.
#define PREFER_VF
#endif

#if defined(PREFER_VF)
//! MLA(a, b, c, d): accumulate vector b into vector a, scaled by the scalar
//! read from float pointer c at element offset d. c and d are parenthesized
//! so that compound argument expressions keep their meaning inside the
//! pointer arithmetic.
#define MLA(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *((c) + (d)))
namespace {
//! Pointer-based counterpart of the vector "extract" used in the SIMD branch:
//! fills ret with the elements of a starting at index n, followed by the
//! first n elements of b, for a total of GI_SIMD_LEN_BYTE / sizeof(float)
//! floats.
//!
//! \param a   source of the leading (lane count - n) output elements
//! \param b   source of the trailing n output elements
//! \param n   number of elements skipped in a and taken from b; callers are
//!            expected to pass 0 <= n <= lane count (not checked here)
//! \param ret output buffer holding at least lane-count floats
GI_FORCEINLINE void ext_float32_ptr(
        const float* a, const float* b, const int n, float* ret) {
    const int t_count = static_cast<int>(GI_SIMD_LEN_BYTE / sizeof(float));
    const int a_count = t_count - n;
    for (int i = 0; i < a_count; i++) {
        ret[i] = a[i + n];
    }
    for (int i = 0; i < n; i++) {
        ret[i + a_count] = b[i];
    }
}
}  // namespace
#else
//! MLA(a, b, c, d): forwards unchanged to GiSimdFmaLane, where c is a vector
//! and d selects the lane of c used as the multiplier.
#define MLA(a, b, c, d) GiSimdFmaLane(a, b, c, d)
#endif
void conv_stride2::do_conv_2x2_stride2( | |||
const float* src, const float* filter, float* dst, size_t IH, size_t IW, | |||
size_t OH, size_t OW, size_t IC) { | |||
@@ -29,7 +53,11 @@ void conv_stride2::do_conv_2x2_stride2( | |||
const float* k0 = filter; | |||
#if defined(PREFER_VF) | |||
const float* _k0123 = k0; | |||
#else | |||
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | |||
#endif | |||
rep(h, OH) { | |||
int nn = OW >> 2; | |||
@@ -41,16 +69,16 @@ void conv_stride2::do_conv_2x2_stride2( | |||
GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6 | |||
GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7 | |||
_outp = GiSimdFmaLane(_outp, _r00, _k0123, 0); | |||
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); | |||
_outp = MLA(_outp, _r00, _k0123, 0); | |||
_outp = MLA(_outp, _r01, _k0123, 1); | |||
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); | |||
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0); | |||
GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1); | |||
_outp = GiSimdFmaLane(_outp, _r10, _k0123, 2); | |||
_outp = GiSimdFmaLane(_outp, _r11, _k0123, 3); | |||
_outp = MLA(_outp, _r10, _k0123, 2); | |||
_outp = MLA(_outp, _r11, _k0123, 3); | |||
GiStoreFloat32(outptr, _outp); | |||
@@ -84,10 +112,18 @@ void conv_stride2::do_conv_3x3_stride2( | |||
const float* k1 = filter + 3; | |||
const float* k2 = filter + 5; | |||
//! Kernel taps for the 3x3 stride-2 loop. k1 is filter + 3 and k2 is
//! filter + 5 (see the pointer setup directly above), so _k5678 starts one
//! element before the third filter row.
//! NOTE(review): presumably k2 is offset to 5 rather than 6 so that a full
//! vector load stays inside the 9-float filter -- confirm against the caller.
#if defined(PREFER_VF) | |||
//! Pointer form: MLA dereferences these as (pointer + lane offset).
const float* _k0123 = k0; | |||
const float* _k3456 = k1; | |||
const float* _k5678 = k2; | |||
//! Rotate _k5678 by one element into a local buffer: the result is
//! k2[1], k2[2], k2[3], k2[0], so offsets 0..2 address filter[6..8].
float _k6789[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||
ext_float32_ptr(_k5678, _k5678, 1, _k6789); | |||
#else | |||
//! Vector form: MLA selects one lane of these registers per call.
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | |||
GI_FLOAT32_t _k3456 = GiLoadFloat32(k1); | |||
GI_FLOAT32_t _k5678 = GiLoadFloat32(k2); | |||
//! Same one-element rotation as the pointer branch, done in registers
//! (assuming GiExtqFloat32 has the usual "extract from concatenation"
//! meaning -- TODO confirm).
GI_FLOAT32_t _k6789 = GiExtqFloat32(_k5678, _k5678, 1); | |||
#endif | |||
rep(h, OH) { | |||
int nn = OW >> 2; | |||
@@ -102,9 +138,9 @@ void conv_stride2::do_conv_3x3_stride2( | |||
GI_FLOAT32_t _r02 = GiExtqFloat32( | |||
_r00, GiGetSubVectorFloat32V2(_r0n, 0), 1); // 2 4 6 8 | |||
_outp = GiSimdFmaLane(_outp, _r00, _k0123, 0); | |||
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); | |||
_outp = GiSimdFmaLane(_outp, _r02, _k0123, 2); | |||
_outp = MLA(_outp, _r00, _k0123, 0); | |||
_outp = MLA(_outp, _r01, _k0123, 1); | |||
_outp = MLA(_outp, _r02, _k0123, 2); | |||
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); | |||
GI_FLOAT32_V2_t _r1n = GiLoadUzipFloat32V2(r1 + 8); | |||
@@ -114,9 +150,9 @@ void conv_stride2::do_conv_3x3_stride2( | |||
GI_FLOAT32_t _r12 = | |||
GiExtqFloat32(_r10, GiGetSubVectorFloat32V2(_r1n, 0), 1); | |||
_outp = GiSimdFmaLane(_outp, _r10, _k3456, 0); | |||
_outp = GiSimdFmaLane(_outp, _r11, _k3456, 1); | |||
_outp = GiSimdFmaLane(_outp, _r12, _k3456, 2); | |||
_outp = MLA(_outp, _r10, _k3456, 0); | |||
_outp = MLA(_outp, _r11, _k3456, 1); | |||
_outp = MLA(_outp, _r12, _k3456, 2); | |||
GI_FLOAT32_V2_t _r2 = GiLoadUzipFloat32V2(r2); | |||
GI_FLOAT32_V2_t _r2n = GiLoadUzipFloat32V2(r2 + 8); | |||
@@ -126,9 +162,9 @@ void conv_stride2::do_conv_3x3_stride2( | |||
GI_FLOAT32_t _r22 = | |||
GiExtqFloat32(_r20, GiGetSubVectorFloat32V2(_r2n, 0), 1); | |||
_outp = GiSimdFmaLane(_outp, _r20, _k6789, 0); | |||
_outp = GiSimdFmaLane(_outp, _r21, _k6789, 1); | |||
_outp = GiSimdFmaLane(_outp, _r22, _k6789, 2); | |||
_outp = MLA(_outp, _r20, _k6789, 0); | |||
_outp = MLA(_outp, _r21, _k6789, 1); | |||
_outp = MLA(_outp, _r22, _k6789, 2); | |||
GiStoreFloat32(outptr, _outp); | |||
@@ -162,6 +198,15 @@ void conv_stride2::do_conv_5x5_stride2( | |||
const float* r3 = src_ptr + IW * 3; | |||
const float* r4 = src_ptr + IW * 4; | |||
#if defined(PREFER_VF) | |||
const float* _k0123 = filter; | |||
const float* _k4567 = filter + 4; | |||
const float* _k891011 = filter + 8; | |||
const float* _k12131415 = filter + 12; | |||
const float* _k16171819 = filter + 16; | |||
const float* _k20212223 = filter + 20; | |||
const float* _k24242424 = filter + 24; | |||
#else | |||
GI_FLOAT32_t _k0123 = GiLoadFloat32(filter); | |||
GI_FLOAT32_t _k4567 = GiLoadFloat32(filter + 4); | |||
GI_FLOAT32_t _k891011 = GiLoadFloat32(filter + 8); | |||
@@ -169,6 +214,7 @@ void conv_stride2::do_conv_5x5_stride2( | |||
GI_FLOAT32_t _k16171819 = GiLoadFloat32(filter + 16); | |||
GI_FLOAT32_t _k20212223 = GiLoadFloat32(filter + 20); | |||
GI_FLOAT32_t _k24242424 = GiBroadcastFloat32(filter[24]); | |||
#endif | |||
for (size_t i = 0; i < OH; i++) { | |||
int nn = OW >> 2; | |||
@@ -230,35 +276,35 @@ void conv_stride2::do_conv_5x5_stride2( | |||
GI_FLOAT32_t _r43 = GiExtqFloat32(_r41, _r4_9111315, 1); | |||
GI_FLOAT32_t _r44 = GiExtqFloat32(_r40, _r4_8101214, 2); | |||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||
_sum = GiSimdFmaLane(_sum, _r10, _k4567, 1); | |||
_sum = GiSimdFmaLane(_sum, _r11, _k4567, 2); | |||
_sum = GiSimdFmaLane(_sum, _r12, _k4567, 3); | |||
_sum = GiSimdFmaLane(_sum, _r13, _k891011, 0); | |||
_sum = GiSimdFmaLane(_sum, _r14, _k891011, 1); | |||
_sum = GiSimdFmaLane(_sum, _r20, _k891011, 2); | |||
_sum = GiSimdFmaLane(_sum, _r21, _k891011, 3); | |||
_sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0); | |||
_sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1); | |||
_sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2); | |||
_sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3); | |||
_sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0); | |||
_sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1); | |||
_sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2); | |||
_sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3); | |||
_sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0); | |||
_sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1); | |||
_sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2); | |||
_sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3); | |||
_sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0); | |||
_sum = MLA(_sum, _r00, _k0123, 0); | |||
_sum = MLA(_sum, _r01, _k0123, 1); | |||
_sum = MLA(_sum, _r02, _k0123, 2); | |||
_sum = MLA(_sum, _r03, _k0123, 3); | |||
_sum = MLA(_sum, _r04, _k4567, 0); | |||
_sum = MLA(_sum, _r10, _k4567, 1); | |||
_sum = MLA(_sum, _r11, _k4567, 2); | |||
_sum = MLA(_sum, _r12, _k4567, 3); | |||
_sum = MLA(_sum, _r13, _k891011, 0); | |||
_sum = MLA(_sum, _r14, _k891011, 1); | |||
_sum = MLA(_sum, _r20, _k891011, 2); | |||
_sum = MLA(_sum, _r21, _k891011, 3); | |||
_sum = MLA(_sum, _r22, _k12131415, 0); | |||
_sum = MLA(_sum, _r23, _k12131415, 1); | |||
_sum = MLA(_sum, _r24, _k12131415, 2); | |||
_sum = MLA(_sum, _r30, _k12131415, 3); | |||
_sum = MLA(_sum, _r31, _k16171819, 0); | |||
_sum = MLA(_sum, _r32, _k16171819, 1); | |||
_sum = MLA(_sum, _r33, _k16171819, 2); | |||
_sum = MLA(_sum, _r34, _k16171819, 3); | |||
_sum = MLA(_sum, _r40, _k20212223, 0); | |||
_sum = MLA(_sum, _r41, _k20212223, 1); | |||
_sum = MLA(_sum, _r42, _k20212223, 2); | |||
_sum = MLA(_sum, _r43, _k20212223, 3); | |||
_sum = MLA(_sum, _r44, _k24242424, 0); | |||
GiStoreFloat32(outptr, _sum); | |||
@@ -312,8 +358,13 @@ void conv_stride2::do_conv_7x7_stride2( | |||
rep(i, nn) { | |||
GI_FLOAT32_t _sum = GiLoadFloat32(outptr); | |||
#if defined(PREFER_VF) | |||
const float* _k0123 = k0; | |||
const float* _k4567 = k0 + 4; | |||
#else | |||
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | |||
GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4); | |||
#endif | |||
GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0); | |||
GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8); | |||
@@ -331,16 +382,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||
GI_FLOAT32_t _r05 = GiExtqFloat32(_r01, _r0_9111315, 2); // 5 7 9 11 | |||
GI_FLOAT32_t _r06 = GiExtqFloat32(_r00, _r0_8101214, 3); // 6 8 10 12 | |||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||
_sum = GiSimdFmaLane(_sum, _r05, _k4567, 1); | |||
_sum = GiSimdFmaLane(_sum, _r06, _k4567, 2); | |||
_sum = MLA(_sum, _r00, _k0123, 0); | |||
_sum = MLA(_sum, _r01, _k0123, 1); | |||
_sum = MLA(_sum, _r02, _k0123, 2); | |||
_sum = MLA(_sum, _r03, _k0123, 3); | |||
_sum = MLA(_sum, _r04, _k4567, 0); | |||
_sum = MLA(_sum, _r05, _k4567, 1); | |||
_sum = MLA(_sum, _r06, _k4567, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k78910 = k1; | |||
const float* _k11121314 = k1 + 4; | |||
#else | |||
GI_FLOAT32_t _k78910 = GiLoadFloat32(k1); | |||
GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4); | |||
#endif | |||
GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1); | |||
GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8); | |||
@@ -354,16 +410,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||
GI_FLOAT32_t _r15 = GiExtqFloat32(_r11, _r1_9111315, 2); | |||
GI_FLOAT32_t _r16 = GiExtqFloat32(_r10, _r1_8101214, 3); | |||
_sum = GiSimdFmaLane(_sum, _r10, _k78910, 0); | |||
_sum = GiSimdFmaLane(_sum, _r11, _k78910, 1); | |||
_sum = GiSimdFmaLane(_sum, _r12, _k78910, 2); | |||
_sum = GiSimdFmaLane(_sum, _r13, _k78910, 3); | |||
_sum = GiSimdFmaLane(_sum, _r14, _k11121314, 0); | |||
_sum = GiSimdFmaLane(_sum, _r15, _k11121314, 1); | |||
_sum = GiSimdFmaLane(_sum, _r16, _k11121314, 2); | |||
_sum = MLA(_sum, _r10, _k78910, 0); | |||
_sum = MLA(_sum, _r11, _k78910, 1); | |||
_sum = MLA(_sum, _r12, _k78910, 2); | |||
_sum = MLA(_sum, _r13, _k78910, 3); | |||
_sum = MLA(_sum, _r14, _k11121314, 0); | |||
_sum = MLA(_sum, _r15, _k11121314, 1); | |||
_sum = MLA(_sum, _r16, _k11121314, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k14151617 = k2; | |||
const float* _k18192021 = k2 + 4; | |||
#else | |||
GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2); | |||
GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4); | |||
#endif | |||
GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2); | |||
GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8); | |||
@@ -377,16 +438,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||
GI_FLOAT32_t _r25 = GiExtqFloat32(_r21, _r2_9111315, 2); | |||
GI_FLOAT32_t _r26 = GiExtqFloat32(_r20, _r2_8101214, 3); | |||
_sum = GiSimdFmaLane(_sum, _r20, _k14151617, 0); | |||
_sum = GiSimdFmaLane(_sum, _r21, _k14151617, 1); | |||
_sum = GiSimdFmaLane(_sum, _r22, _k14151617, 2); | |||
_sum = GiSimdFmaLane(_sum, _r23, _k14151617, 3); | |||
_sum = GiSimdFmaLane(_sum, _r24, _k18192021, 0); | |||
_sum = GiSimdFmaLane(_sum, _r25, _k18192021, 1); | |||
_sum = GiSimdFmaLane(_sum, _r26, _k18192021, 2); | |||
_sum = MLA(_sum, _r20, _k14151617, 0); | |||
_sum = MLA(_sum, _r21, _k14151617, 1); | |||
_sum = MLA(_sum, _r22, _k14151617, 2); | |||
_sum = MLA(_sum, _r23, _k14151617, 3); | |||
_sum = MLA(_sum, _r24, _k18192021, 0); | |||
_sum = MLA(_sum, _r25, _k18192021, 1); | |||
_sum = MLA(_sum, _r26, _k18192021, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k21222324 = k3; | |||
const float* _k25262728 = k3 + 4; | |||
#else | |||
GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3); | |||
GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4); | |||
#endif | |||
GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3); | |||
GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8); | |||
@@ -400,16 +466,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||
GI_FLOAT32_t _r35 = GiExtqFloat32(_r31, _r3_9111315, 2); | |||
GI_FLOAT32_t _r36 = GiExtqFloat32(_r30, _r3_8101214, 3); | |||
_sum = GiSimdFmaLane(_sum, _r30, _k21222324, 0); | |||
_sum = GiSimdFmaLane(_sum, _r31, _k21222324, 1); | |||
_sum = GiSimdFmaLane(_sum, _r32, _k21222324, 2); | |||
_sum = GiSimdFmaLane(_sum, _r33, _k21222324, 3); | |||
_sum = GiSimdFmaLane(_sum, _r34, _k25262728, 0); | |||
_sum = GiSimdFmaLane(_sum, _r35, _k25262728, 1); | |||
_sum = GiSimdFmaLane(_sum, _r36, _k25262728, 2); | |||
_sum = MLA(_sum, _r30, _k21222324, 0); | |||
_sum = MLA(_sum, _r31, _k21222324, 1); | |||
_sum = MLA(_sum, _r32, _k21222324, 2); | |||
_sum = MLA(_sum, _r33, _k21222324, 3); | |||
_sum = MLA(_sum, _r34, _k25262728, 0); | |||
_sum = MLA(_sum, _r35, _k25262728, 1); | |||
_sum = MLA(_sum, _r36, _k25262728, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k28293031 = k4; | |||
const float* _k32333435 = k4 + 4; | |||
#else | |||
GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4); | |||
GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4); | |||
#endif | |||
GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4); | |||
GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8); | |||
@@ -423,16 +494,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||
GI_FLOAT32_t _r45 = GiExtqFloat32(_r41, _r4_9111315, 2); | |||
GI_FLOAT32_t _r46 = GiExtqFloat32(_r40, _r4_8101214, 3); | |||
_sum = GiSimdFmaLane(_sum, _r40, _k28293031, 0); | |||
_sum = GiSimdFmaLane(_sum, _r41, _k28293031, 1); | |||
_sum = GiSimdFmaLane(_sum, _r42, _k28293031, 2); | |||
_sum = GiSimdFmaLane(_sum, _r43, _k28293031, 3); | |||
_sum = GiSimdFmaLane(_sum, _r44, _k32333435, 0); | |||
_sum = GiSimdFmaLane(_sum, _r45, _k32333435, 1); | |||
_sum = GiSimdFmaLane(_sum, _r46, _k32333435, 2); | |||
_sum = MLA(_sum, _r40, _k28293031, 0); | |||
_sum = MLA(_sum, _r41, _k28293031, 1); | |||
_sum = MLA(_sum, _r42, _k28293031, 2); | |||
_sum = MLA(_sum, _r43, _k28293031, 3); | |||
_sum = MLA(_sum, _r44, _k32333435, 0); | |||
_sum = MLA(_sum, _r45, _k32333435, 1); | |||
_sum = MLA(_sum, _r46, _k32333435, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k35363738 = k5; | |||
const float* _k39404142 = k5 + 4; | |||
#else | |||
GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5); | |||
GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4); | |||
#endif | |||
GI_FLOAT32_V2_t _r50_02461357 = GiLoadUzipFloat32V2(r5); | |||
GI_FLOAT32_V2_t _r50nx2 = GiLoadUzipFloat32V2(r5 + 8); | |||
@@ -446,16 +522,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||
GI_FLOAT32_t _r55 = GiExtqFloat32(_r51, _r5_9111315, 2); | |||
GI_FLOAT32_t _r56 = GiExtqFloat32(_r50, _r5_8101214, 3); | |||
_sum = GiSimdFmaLane(_sum, _r50, _k35363738, 0); | |||
_sum = GiSimdFmaLane(_sum, _r51, _k35363738, 1); | |||
_sum = GiSimdFmaLane(_sum, _r52, _k35363738, 2); | |||
_sum = GiSimdFmaLane(_sum, _r53, _k35363738, 3); | |||
_sum = GiSimdFmaLane(_sum, _r54, _k39404142, 0); | |||
_sum = GiSimdFmaLane(_sum, _r55, _k39404142, 1); | |||
_sum = GiSimdFmaLane(_sum, _r56, _k39404142, 2); | |||
_sum = MLA(_sum, _r50, _k35363738, 0); | |||
_sum = MLA(_sum, _r51, _k35363738, 1); | |||
_sum = MLA(_sum, _r52, _k35363738, 2); | |||
_sum = MLA(_sum, _r53, _k35363738, 3); | |||
_sum = MLA(_sum, _r54, _k39404142, 0); | |||
_sum = MLA(_sum, _r55, _k39404142, 1); | |||
_sum = MLA(_sum, _r56, _k39404142, 2); | |||
#if defined(PREFER_VF) | |||
const float* _k42434445 = k6; | |||
const float* _k45464748 = k6 + 3; | |||
#else | |||
GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6); | |||
GI_FLOAT32_t _k45464748 = GiLoadFloat32(k6 + 3); | |||
#endif | |||
GI_FLOAT32_V2_t _r60_02461357 = GiLoadUzipFloat32V2(r6); | |||
GI_FLOAT32_V2_t _r60nx2 = GiLoadUzipFloat32V2(r6 + 8); | |||
@@ -469,13 +550,13 @@ void conv_stride2::do_conv_7x7_stride2( | |||
GI_FLOAT32_t _r65 = GiExtqFloat32(_r61, _r6_9111315, 2); | |||
GI_FLOAT32_t _r66 = GiExtqFloat32(_r60, _r6_8101214, 3); | |||
_sum = GiSimdFmaLane(_sum, _r60, _k42434445, 0); | |||
_sum = GiSimdFmaLane(_sum, _r61, _k42434445, 1); | |||
_sum = GiSimdFmaLane(_sum, _r62, _k42434445, 2); | |||
_sum = GiSimdFmaLane(_sum, _r63, _k42434445, 3); | |||
_sum = GiSimdFmaLane(_sum, _r64, _k45464748, 1); | |||
_sum = GiSimdFmaLane(_sum, _r65, _k45464748, 2); | |||
_sum = GiSimdFmaLane(_sum, _r66, _k45464748, 3); | |||
_sum = MLA(_sum, _r60, _k42434445, 0); | |||
_sum = MLA(_sum, _r61, _k42434445, 1); | |||
_sum = MLA(_sum, _r62, _k42434445, 2); | |||
_sum = MLA(_sum, _r63, _k42434445, 3); | |||
_sum = MLA(_sum, _r64, _k45464748, 1); | |||
_sum = MLA(_sum, _r65, _k45464748, 2); | |||
_sum = MLA(_sum, _r66, _k45464748, 3); | |||
GiStoreFloat32(outptr, _sum); | |||
@@ -75,6 +75,21 @@ struct InputTransformF73_NCHW44 { | |||
size_t icb = ic / pack_size; | |||
GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8; | |||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||
//! GiSimdFmaLane is slow on x86 and RVV; as an alternative, use the | |||
//! scalar-operand GiMultiplyAddScalarFloat32 / GiMultiplySubScalarFloat32 | |||
#define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) | |||
const float* v0 = input_parameters + 0; | |||
const float* v1 = input_parameters + 4; | |||
const float* v2 = input_parameters + 8; | |||
const float* v3 = input_parameters + 12; | |||
const float* v4 = input_parameters + 16; | |||
const float* v5 = input_parameters + 20; | |||
const float* v6 = input_parameters + 24; | |||
#define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d)) | |||
#else | |||
#define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d) | |||
#define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d) | |||
GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters + 0); | |||
GI_FLOAT32_t v1 = GiLoadFloat32(input_parameters + 4); | |||
GI_FLOAT32_t v2 = GiLoadFloat32(input_parameters + 8); | |||
@@ -82,6 +97,7 @@ struct InputTransformF73_NCHW44 { | |||
GI_FLOAT32_t v4 = GiLoadFloat32(input_parameters + 16); | |||
GI_FLOAT32_t v5 = GiLoadFloat32(input_parameters + 20); | |||
GI_FLOAT32_t v6 = GiLoadFloat32(input_parameters + 24); | |||
#endif | |||
//! B | |||
//! 1.5 0 0 0 0 0 0 0 0 | |||
@@ -120,59 +136,59 @@ struct InputTransformF73_NCHW44 { | |||
auto t##i##5 = d7; \ | |||
auto t##i##6 = d7; \ | |||
auto t##i##7 = d7; \ | |||
t##i##8 = GiFmsqLaneQFloat32(t##i##8, d7, v0, 0); \ | |||
t##i##8 = MSUB(t##i##8, d7, v0, 0); \ | |||
t##i##0 = GiSubtractFloat32(t##i##0, d1); \ | |||
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d1, v0, 0); \ | |||
t##i##2 = GiSimdFmaLane(t##i##2, d1, v0, 0); \ | |||
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d1, v0, 1); \ | |||
t##i##4 = GiSimdFmaLane(t##i##4, d1, v0, 1); \ | |||
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d1, v0, 2); \ | |||
t##i##6 = GiSimdFmaLane(t##i##6, d1, v0, 2); \ | |||
t##i##1 = MSUB(t##i##1, d1, v0, 0); \ | |||
t##i##2 = MADD(t##i##2, d1, v0, 0); \ | |||
t##i##3 = MSUB(t##i##3, d1, v0, 1); \ | |||
t##i##4 = MADD(t##i##4, d1, v0, 1); \ | |||
t##i##5 = MSUB(t##i##5, d1, v0, 2); \ | |||
t##i##6 = MADD(t##i##6, d1, v0, 2); \ | |||
t##i##7 = GiSubtractFloat32(t##i##7, d1); \ | |||
t##i##8 = GiSimdFmaLane(t##i##8, d1, v0, 0); \ | |||
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d2, v0, 3); \ | |||
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d2, v1, 0); \ | |||
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d2, v1, 1); \ | |||
t##i##3 = GiSimdFmaLane(t##i##3, d2, v1, 2); \ | |||
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d2, v1, 3); \ | |||
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d2, v2, 0); \ | |||
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d2, v2, 1); \ | |||
t##i##8 = MADD(t##i##8, d1, v0, 0); \ | |||
t##i##0 = MSUB(t##i##0, d2, v0, 3); \ | |||
t##i##1 = MSUB(t##i##1, d2, v1, 0); \ | |||
t##i##2 = MSUB(t##i##2, d2, v1, 1); \ | |||
t##i##3 = MADD(t##i##3, d2, v1, 2); \ | |||
t##i##4 = MSUB(t##i##4, d2, v1, 3); \ | |||
t##i##5 = MSUB(t##i##5, d2, v2, 0); \ | |||
t##i##6 = MSUB(t##i##6, d2, v2, 1); \ | |||
t##i##8 = GiSubtractFloat32(t##i##8, d2); \ | |||
t##i##0 = GiSimdFmaLane(t##i##0, d3, v2, 2); \ | |||
t##i##1 = GiSimdFmaLane(t##i##1, d3, v2, 3); \ | |||
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d3, v3, 0); \ | |||
t##i##3 = GiSimdFmaLane(t##i##3, d3, v2, 0); \ | |||
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d3, v3, 1); \ | |||
t##i##5 = GiSimdFmaLane(t##i##5, d3, v3, 2); \ | |||
t##i##6 = GiSimdFmaLane(t##i##6, d3, v3, 3); \ | |||
t##i##7 = GiSimdFmaLane(t##i##7, d3, v2, 2); \ | |||
t##i##8 = GiFmsqLaneQFloat32(t##i##8, d3, v0, 3); \ | |||
t##i##0 = GiSimdFmaLane(t##i##0, d4, v0, 3); \ | |||
t##i##1 = GiSimdFmaLane(t##i##1, d4, v4, 0); \ | |||
t##i##2 = GiSimdFmaLane(t##i##2, d4, v4, 1); \ | |||
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d4, v4, 2); \ | |||
t##i##4 = GiSimdFmaLane(t##i##4, d4, v4, 3); \ | |||
t##i##5 = GiSimdFmaLane(t##i##5, d4, v5, 0); \ | |||
t##i##6 = GiSimdFmaLane(t##i##6, d4, v5, 1); \ | |||
t##i##8 = GiSimdFmaLane(t##i##8, d4, v2, 2); \ | |||
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d5, v2, 2); \ | |||
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d5, v5, 2); \ | |||
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d5, v5, 3); \ | |||
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d5, v6, 0); \ | |||
t##i##4 = GiSimdFmaLane(t##i##4, d5, v6, 1); \ | |||
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d5, v5, 2); \ | |||
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d5, v6, 0); \ | |||
t##i##7 = GiFmsqLaneQFloat32(t##i##7, d5, v2, 2); \ | |||
t##i##8 = GiSimdFmaLane(t##i##8, d5, v0, 3); \ | |||
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d6, v0, 0); \ | |||
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d6, v1, 0); \ | |||
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d6, v1, 1); \ | |||
t##i##3 = GiSimdFmaLane(t##i##3, d6, v1, 0); \ | |||
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d6, v3, 1); \ | |||
t##i##0 = MADD(t##i##0, d3, v2, 2); \ | |||
t##i##1 = MADD(t##i##1, d3, v2, 3); \ | |||
t##i##2 = MSUB(t##i##2, d3, v3, 0); \ | |||
t##i##3 = MADD(t##i##3, d3, v2, 0); \ | |||
t##i##4 = MSUB(t##i##4, d3, v3, 1); \ | |||
t##i##5 = MADD(t##i##5, d3, v3, 2); \ | |||
t##i##6 = MADD(t##i##6, d3, v3, 3); \ | |||
t##i##7 = MADD(t##i##7, d3, v2, 2); \ | |||
t##i##8 = MSUB(t##i##8, d3, v0, 3); \ | |||
t##i##0 = MADD(t##i##0, d4, v0, 3); \ | |||
t##i##1 = MADD(t##i##1, d4, v4, 0); \ | |||
t##i##2 = MADD(t##i##2, d4, v4, 1); \ | |||
t##i##3 = MSUB(t##i##3, d4, v4, 2); \ | |||
t##i##4 = MADD(t##i##4, d4, v4, 3); \ | |||
t##i##5 = MADD(t##i##5, d4, v5, 0); \ | |||
t##i##6 = MADD(t##i##6, d4, v5, 1); \ | |||
t##i##8 = MADD(t##i##8, d4, v2, 2); \ | |||
t##i##0 = MSUB(t##i##0, d5, v2, 2); \ | |||
t##i##1 = MSUB(t##i##1, d5, v5, 2); \ | |||
t##i##2 = MSUB(t##i##2, d5, v5, 3); \ | |||
t##i##3 = MSUB(t##i##3, d5, v6, 0); \ | |||
t##i##4 = MADD(t##i##4, d5, v6, 1); \ | |||
t##i##5 = MSUB(t##i##5, d5, v5, 2); \ | |||
t##i##6 = MSUB(t##i##6, d5, v6, 0); \ | |||
t##i##7 = MSUB(t##i##7, d5, v2, 2); \ | |||
t##i##8 = MADD(t##i##8, d5, v0, 3); \ | |||
t##i##0 = MSUB(t##i##0, d6, v0, 0); \ | |||
t##i##1 = MSUB(t##i##1, d6, v1, 0); \ | |||
t##i##2 = MSUB(t##i##2, d6, v1, 1); \ | |||
t##i##3 = MADD(t##i##3, d6, v1, 0); \ | |||
t##i##4 = MSUB(t##i##4, d6, v3, 1); \ | |||
t##i##5 = GiSubtractFloat32(t##i##5, d6); \ | |||
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d6, v6, 2); \ | |||
t##i##8 = GiFmsqLaneQFloat32(t##i##8, d6, v2, 2); \ | |||
t##i##0 = GiSimdFmaLane(t##i##0, d0, v0, 0); | |||
t##i##6 = MSUB(t##i##6, d6, v6, 2); \ | |||
t##i##8 = MSUB(t##i##8, d6, v2, 2); \ | |||
t##i##0 = MADD(t##i##0, d0, v0, 0); | |||
UNROLL_CALL_RAW(9, cb); | |||
#undef cb | |||
@@ -187,59 +203,59 @@ struct InputTransformF73_NCHW44 { | |||
d5 = t7##i; \ | |||
d6 = t7##i; \ | |||
d7 = t7##i; \ | |||
d8 = GiFmsqLaneQFloat32(d8, t7##i, v0, 0); \ | |||
d8 = MSUB(d8, t7##i, v0, 0); \ | |||
d0 = GiSubtractFloat32(d0, t1##i); \ | |||
d1 = GiFmsqLaneQFloat32(d1, t1##i, v0, 0); \ | |||
d2 = GiSimdFmaLane(d2, t1##i, v0, 0); \ | |||
d3 = GiFmsqLaneQFloat32(d3, t1##i, v0, 1); \ | |||
d4 = GiSimdFmaLane(d4, t1##i, v0, 1); \ | |||
d5 = GiFmsqLaneQFloat32(d5, t1##i, v0, 2); \ | |||
d6 = GiSimdFmaLane(d6, t1##i, v0, 2); \ | |||
d1 = MSUB(d1, t1##i, v0, 0); \ | |||
d2 = MADD(d2, t1##i, v0, 0); \ | |||
d3 = MSUB(d3, t1##i, v0, 1); \ | |||
d4 = MADD(d4, t1##i, v0, 1); \ | |||
d5 = MSUB(d5, t1##i, v0, 2); \ | |||
d6 = MADD(d6, t1##i, v0, 2); \ | |||
d7 = GiSubtractFloat32(d7, t1##i); \ | |||
d8 = GiSimdFmaLane(d8, t1##i, v0, 0); \ | |||
d0 = GiFmsqLaneQFloat32(d0, t2##i, v0, 3); \ | |||
d1 = GiFmsqLaneQFloat32(d1, t2##i, v1, 0); \ | |||
d2 = GiFmsqLaneQFloat32(d2, t2##i, v1, 1); \ | |||
d3 = GiSimdFmaLane(d3, t2##i, v1, 2); \ | |||
d4 = GiFmsqLaneQFloat32(d4, t2##i, v1, 3); \ | |||
d5 = GiFmsqLaneQFloat32(d5, t2##i, v2, 0); \ | |||
d6 = GiFmsqLaneQFloat32(d6, t2##i, v2, 1); \ | |||
d8 = MADD(d8, t1##i, v0, 0); \ | |||
d0 = MSUB(d0, t2##i, v0, 3); \ | |||
d1 = MSUB(d1, t2##i, v1, 0); \ | |||
d2 = MSUB(d2, t2##i, v1, 1); \ | |||
d3 = MADD(d3, t2##i, v1, 2); \ | |||
d4 = MSUB(d4, t2##i, v1, 3); \ | |||
d5 = MSUB(d5, t2##i, v2, 0); \ | |||
d6 = MSUB(d6, t2##i, v2, 1); \ | |||
d8 = GiSubtractFloat32(d8, t2##i); \ | |||
d0 = GiSimdFmaLane(d0, t3##i, v2, 2); \ | |||
d1 = GiSimdFmaLane(d1, t3##i, v2, 3); \ | |||
d2 = GiFmsqLaneQFloat32(d2, t3##i, v3, 0); \ | |||
d3 = GiSimdFmaLane(d3, t3##i, v2, 0); \ | |||
d4 = GiFmsqLaneQFloat32(d4, t3##i, v3, 1); \ | |||
d5 = GiSimdFmaLane(d5, t3##i, v3, 2); \ | |||
d6 = GiSimdFmaLane(d6, t3##i, v3, 3); \ | |||
d7 = GiSimdFmaLane(d7, t3##i, v2, 2); \ | |||
d8 = GiFmsqLaneQFloat32(d8, t3##i, v0, 3); \ | |||
d0 = GiSimdFmaLane(d0, t4##i, v0, 3); \ | |||
d1 = GiSimdFmaLane(d1, t4##i, v4, 0); \ | |||
d2 = GiSimdFmaLane(d2, t4##i, v4, 1); \ | |||
d3 = GiFmsqLaneQFloat32(d3, t4##i, v4, 2); \ | |||
d4 = GiSimdFmaLane(d4, t4##i, v4, 3); \ | |||
d5 = GiSimdFmaLane(d5, t4##i, v5, 0); \ | |||
d6 = GiSimdFmaLane(d6, t4##i, v5, 1); \ | |||
d8 = GiSimdFmaLane(d8, t4##i, v2, 2); \ | |||
d0 = GiFmsqLaneQFloat32(d0, t5##i, v2, 2); \ | |||
d1 = GiFmsqLaneQFloat32(d1, t5##i, v5, 2); \ | |||
d2 = GiFmsqLaneQFloat32(d2, t5##i, v5, 3); \ | |||
d3 = GiFmsqLaneQFloat32(d3, t5##i, v6, 0); \ | |||
d4 = GiSimdFmaLane(d4, t5##i, v6, 1); \ | |||
d5 = GiFmsqLaneQFloat32(d5, t5##i, v5, 2); \ | |||
d6 = GiFmsqLaneQFloat32(d6, t5##i, v6, 0); \ | |||
d7 = GiFmsqLaneQFloat32(d7, t5##i, v2, 2); \ | |||
d8 = GiSimdFmaLane(d8, t5##i, v0, 3); \ | |||
d0 = GiFmsqLaneQFloat32(d0, t6##i, v0, 0); \ | |||
d1 = GiFmsqLaneQFloat32(d1, t6##i, v1, 0); \ | |||
d2 = GiFmsqLaneQFloat32(d2, t6##i, v1, 1); \ | |||
d3 = GiSimdFmaLane(d3, t6##i, v1, 0); \ | |||
d4 = GiFmsqLaneQFloat32(d4, t6##i, v3, 1); \ | |||
d0 = MADD(d0, t3##i, v2, 2); \ | |||
d1 = MADD(d1, t3##i, v2, 3); \ | |||
d2 = MSUB(d2, t3##i, v3, 0); \ | |||
d3 = MADD(d3, t3##i, v2, 0); \ | |||
d4 = MSUB(d4, t3##i, v3, 1); \ | |||
d5 = MADD(d5, t3##i, v3, 2); \ | |||
d6 = MADD(d6, t3##i, v3, 3); \ | |||
d7 = MADD(d7, t3##i, v2, 2); \ | |||
d8 = MSUB(d8, t3##i, v0, 3); \ | |||
d0 = MADD(d0, t4##i, v0, 3); \ | |||
d1 = MADD(d1, t4##i, v4, 0); \ | |||
d2 = MADD(d2, t4##i, v4, 1); \ | |||
d3 = MSUB(d3, t4##i, v4, 2); \ | |||
d4 = MADD(d4, t4##i, v4, 3); \ | |||
d5 = MADD(d5, t4##i, v5, 0); \ | |||
d6 = MADD(d6, t4##i, v5, 1); \ | |||
d8 = MADD(d8, t4##i, v2, 2); \ | |||
d0 = MSUB(d0, t5##i, v2, 2); \ | |||
d1 = MSUB(d1, t5##i, v5, 2); \ | |||
d2 = MSUB(d2, t5##i, v5, 3); \ | |||
d3 = MSUB(d3, t5##i, v6, 0); \ | |||
d4 = MADD(d4, t5##i, v6, 1); \ | |||
d5 = MSUB(d5, t5##i, v5, 2); \ | |||
d6 = MSUB(d6, t5##i, v6, 0); \ | |||
d7 = MSUB(d7, t5##i, v2, 2); \ | |||
d8 = MADD(d8, t5##i, v0, 3); \ | |||
d0 = MSUB(d0, t6##i, v0, 0); \ | |||
d1 = MSUB(d1, t6##i, v1, 0); \ | |||
d2 = MSUB(d2, t6##i, v1, 1); \ | |||
d3 = MADD(d3, t6##i, v1, 0); \ | |||
d4 = MSUB(d4, t6##i, v3, 1); \ | |||
d5 = GiSubtractFloat32(d5, t6##i); \ | |||
d6 = GiFmsqLaneQFloat32(d6, t6##i, v6, 2); \ | |||
d8 = GiFmsqLaneQFloat32(d8, t6##i, v2, 2); \ | |||
d0 = GiSimdFmaLane(d0, t0##i, v0, 0); \ | |||
d6 = MSUB(d6, t6##i, v6, 2); \ | |||
d8 = MSUB(d8, t6##i, v2, 2); \ | |||
d0 = MADD(d0, t0##i, v0, 0); \ | |||
GiStoreFloat32( \ | |||
input_transform_buf + \ | |||
(0 * alpha + i) * ICB * nr_units_in_tile * pack_size + \ | |||
@@ -288,6 +304,8 @@ struct InputTransformF73_NCHW44 { | |||
UNROLL_CALL_RAW(9, cb); | |||
#undef cb | |||
#undef MADD | |||
#undef MSUB | |||
} | |||
}; | |||
@@ -224,9 +224,7 @@ GI_FLOAT32_t GiMlaqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t c) { | |||
#endif | |||
#elif defined(GI_SSE2_INTRINSICS) | |||
// fma is coming soon, but right now: | |||
__m128 res; | |||
res = _mm_mul_ps(c, b); | |||
return _mm_add_ps(a, res); | |||
return _mm_add_ps(a, _mm_mul_ps(c, b)); | |||
#elif defined(GI_RVV_INTRINSICS) | |||
return vfmadd_vv_f32m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(float)); | |||
#else | |||