GitOrigin-RevId: f29593be4d
HuaHua404-patch-4
@@ -24,21 +24,27 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> { | |||||
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} | static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} | ||||
}; | }; | ||||
#define cb2(step, lane, ow_block) \ | |||||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[0][lane]), \ | |||||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \ | |||||
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(c[1][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[1][lane]), \ | |||||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); | |||||
#define cb(step, lane, ow_block) \ | |||||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[0][lane]), \ | |||||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); | |||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
//! The GiSimdFmaLane API is slow on x86 and RVV; as an alternative, use | |||||
//! GiMultiplyAddScalarFloat32 | |||||
#define MLA(a, b, c, d) \ | |||||
GiMultiplyAddScalarFloat32( \ | |||||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d)) | |||||
#else | |||||
#define MLA(a, b, c, d) \ | |||||
GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \ | |||||
GiFixLenType2GiFloat32Type(c), d) | |||||
#endif | |||||
#define cb2(step, lane, ow_block) \ | |||||
c[0][step] = GiFloat32Type2FixLenType( \ | |||||
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \ | |||||
c[1][step] = GiFloat32Type2FixLenType( \ | |||||
MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane)); | |||||
#define cb(step, lane, ow_block) \ | |||||
c[0][step] = GiFloat32Type2FixLenType( \ | |||||
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); | |||||
#define SHIFT_CAL_HELPER(ow_block, remain_w) \ | #define SHIFT_CAL_HELPER(ow_block, remain_w) \ | ||||
template < \ | template < \ | ||||
@@ -81,6 +87,7 @@ SHIFT_CAL_HELPER(4, 4); | |||||
#undef SHIFT_CAL_HELPER | #undef SHIFT_CAL_HELPER | ||||
#undef cb | #undef cb | ||||
#undef cb2 | #undef cb2 | ||||
#undef MLA | |||||
template < | template < | ||||
int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T, | int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T, | ||||
@@ -145,14 +152,23 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | ||||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | ||||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | ||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
const float* src[ow_block]; | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr + (ow_block)*ic_step; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -188,19 +204,32 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | ||||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | ||||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
const float* src[ow_block]; | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | GI_FLOAT32_FIXLEN_t src[ow_block]; | ||||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||||
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr + (ow_block)*ic_step; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[1] = src_ptr + (ow_block + 1) * ic_step; | |||||
#else | |||||
src[1] = GiFloat32Type2FixLenType( | src[1] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); | GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -235,33 +264,54 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> { | |||||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | ||||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | ||||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
const float* src[ow_block]; | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | GI_FLOAT32_FIXLEN_t src[ow_block]; | ||||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||||
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr + (ow_block)*ic_step; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[1] = src_ptr + (ow_block + 1) * ic_step; | |||||
#else | |||||
src[1] = GiFloat32Type2FixLenType( | src[1] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); | GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[2] = src_ptr + (ow_block + 2) * ic_step; | |||||
#else | |||||
src[2] = GiFloat32Type2FixLenType( | src[2] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); | GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[3] = src_ptr + (ow_block + 3) * ic_step; | |||||
#else | |||||
src[3] = GiFloat32Type2FixLenType( | src[3] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); | GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -297,45 +347,74 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> { | |||||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | ||||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | ||||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
const float* src[ow_block]; | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | GI_FLOAT32_FIXLEN_t src[ow_block]; | ||||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; | |||||
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr + (ow_block)*ic_step; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | GiLoadFloat32(src_ptr + (ow_block)*ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[1] = src_ptr + (ow_block + 1) * ic_step; | |||||
#else | |||||
src[1] = GiFloat32Type2FixLenType( | src[1] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); | GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[2] = src_ptr + (ow_block + 2) * ic_step; | |||||
#else | |||||
src[2] = GiFloat32Type2FixLenType( | src[2] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); | GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[3] = src_ptr + (ow_block + 3) * ic_step; | |||||
#else | |||||
src[3] = GiFloat32Type2FixLenType( | src[3] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); | GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[4] = src_ptr + (ow_block + 4) * ic_step; | |||||
#else | |||||
src[4] = GiFloat32Type2FixLenType( | src[4] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 4) * ic_step)); | GiLoadFloat32(src_ptr + (ow_block + 4) * ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[5] = src_ptr + (ow_block + 5) * ic_step; | |||||
#else | |||||
src[5] = GiFloat32Type2FixLenType( | src[5] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 5) * ic_step)); | GiLoadFloat32(src_ptr + (ow_block + 5) * ic_step)); | ||||
#endif | |||||
load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -24,21 +24,28 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> { | |||||
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} | static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} | ||||
}; | }; | ||||
#define cb2(step, lane, ow_block) \ | |||||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[0][lane]), \ | |||||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \ | |||||
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(c[1][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[1][lane]), \ | |||||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); | |||||
#define cb(step, lane, ow_block) \ | |||||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[0][lane]), \ | |||||
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); | |||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
//! The GiSimdFmaLane API is slow on x86 and RVV; as an alternative, use | |||||
//! GiMultiplyAddScalarFloat32 | |||||
#define MLA(a, b, c, d) \ | |||||
GiMultiplyAddScalarFloat32( \ | |||||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d)) | |||||
#else | |||||
#define MLA(a, b, c, d) \ | |||||
GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \ | |||||
GiFixLenType2GiFloat32Type(c), d) | |||||
#endif | |||||
#define cb2(step, lane, ow_block) \ | |||||
c[0][step] = GiFloat32Type2FixLenType( \ | |||||
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \ | |||||
c[1][step] = GiFloat32Type2FixLenType( \ | |||||
MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane)); | |||||
#define cb(step, lane, ow_block) \ | |||||
c[0][step] = GiFloat32Type2FixLenType( \ | |||||
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); | |||||
#define SHIFT_CAL_HELPER(ow_block, remain_w) \ | #define SHIFT_CAL_HELPER(ow_block, remain_w) \ | ||||
template < \ | template < \ | ||||
@@ -81,6 +88,7 @@ SHIFT_CAL_HELPER(4, 4); | |||||
#undef SHIFT_CAL_HELPER | #undef SHIFT_CAL_HELPER | ||||
#undef cb | #undef cb | ||||
#undef cb2 | #undef cb2 | ||||
#undef MLA | |||||
template < | template < | ||||
int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T, | int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T, | ||||
@@ -146,15 +154,24 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | ||||
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | ||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | ||||
/////////row 0///////////// | |||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
const float* src[ow_block]; | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
/////////row 0///////////// | |||||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | ||||
#endif | |||||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -162,12 +179,20 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||||
src_ptr_odd += ld_src_iw; | src_ptr_odd += ld_src_iw; | ||||
weight_ptr += ld_weight_fh; | weight_ptr += ld_weight_fh; | ||||
/////////row 1///////////// | /////////row 1///////////// | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | ||||
#endif | |||||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -203,21 +228,34 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | ||||
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | ||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | ||||
/////////row 0///////////// | |||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
const float* src[ow_block]; | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
/////////row 0///////////// | |||||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr + ow_block * simd_len; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | GiLoadFloat32(src_ptr + ow_block * simd_len)); | ||||
#endif | |||||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | ||||
#endif | |||||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -225,17 +263,29 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||||
src_ptr_odd += ld_src_iw; | src_ptr_odd += ld_src_iw; | ||||
weight_ptr += ld_weight_fh; | weight_ptr += ld_weight_fh; | ||||
/////////row 1///////////// | /////////row 1///////////// | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr + ow_block * simd_len; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | GiLoadFloat32(src_ptr + ow_block * simd_len)); | ||||
#endif | |||||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | ||||
#endif | |||||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -243,18 +293,30 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||||
src_ptr_odd += ld_src_iw; | src_ptr_odd += ld_src_iw; | ||||
weight_ptr += ld_weight_fh; | weight_ptr += ld_weight_fh; | ||||
//////////row 2///////////// | //////////row 2///////////// | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr + ow_block * simd_len; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | GiLoadFloat32(src_ptr + ow_block * simd_len)); | ||||
#endif | |||||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | ||||
#endif | |||||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -292,30 +354,51 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> { | |||||
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | ||||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | ||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | ||||
// even element | |||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
const float* src[ow_block]; | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
// even element | |||||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr + ow_block * simd_len; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | GiLoadFloat32(src_ptr + ow_block * simd_len)); | ||||
#endif | |||||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[1] = src_ptr + (ow_block + 1) * simd_len; | |||||
#else | |||||
src[1] = GiFloat32Type2FixLenType( | src[1] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); | GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); | ||||
#endif | |||||
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
// odd element | // odd element | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | ||||
#endif | |||||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr_odd + ow_block * simd_len; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr_odd + ow_block * simd_len)); | GiLoadFloat32(src_ptr_odd + ow_block * simd_len)); | ||||
#endif | |||||
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -360,40 +443,69 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> { | |||||
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | ||||
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { | ||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | GI_FLOAT32_FIXLEN_t weight[c_dim][4]; | ||||
// even element | |||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
const float* src[ow_block]; | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0); | |||||
#else | |||||
GI_FLOAT32_FIXLEN_t src[ow_block]; | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0); | ||||
#endif | |||||
// even element | |||||
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr + ow_block * simd_len; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + ow_block * simd_len)); | GiLoadFloat32(src_ptr + ow_block * simd_len)); | ||||
#endif | |||||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[1] = src_ptr + (ow_block + 1) * simd_len; | |||||
#else | |||||
src[1] = GiFloat32Type2FixLenType( | src[1] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); | GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); | ||||
#endif | |||||
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[2] = src_ptr + (ow_block + 2) * simd_len; | |||||
#else | |||||
src[2] = GiFloat32Type2FixLenType( | src[2] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr + (ow_block + 2) * simd_len)); | GiLoadFloat32(src_ptr + (ow_block + 2) * simd_len)); | ||||
#endif | |||||
load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
// odd element | // odd element | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0); | |||||
#else | |||||
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0); | ||||
#endif | |||||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[0] = src_ptr_odd + ow_block * simd_len; | |||||
#else | |||||
src[0] = GiFloat32Type2FixLenType( | src[0] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr_odd + ow_block * simd_len)); | GiLoadFloat32(src_ptr_odd + ow_block * simd_len)); | ||||
#endif | |||||
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
src[1] = src_ptr_odd + (ow_block + 1) * simd_len; | |||||
#else | |||||
src[1] = GiFloat32Type2FixLenType( | src[1] = GiFloat32Type2FixLenType( | ||||
GiLoadFloat32(src_ptr_odd + (ow_block + 1) * simd_len)); | GiLoadFloat32(src_ptr_odd + (ow_block + 1) * simd_len)); | ||||
#endif | |||||
load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>( | load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>( | ||||
weight, weight_ptr, ld_weight_oc); | weight, weight_ptr, ld_weight_oc); | ||||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | ||||
@@ -40,44 +40,29 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> { | |||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | ||||
//! The GiSimdFmaLane API is slow on x86 and RVV; as an alternative, use | //! The GiSimdFmaLane API is slow on x86 and RVV; as an alternative, use | ||||
//! GiMultiplyAddScalarFloat32. | //! GiMultiplyAddScalarFloat32. | ||||
#define MLA GiMultiplyAddScalarFloat32 | |||||
#define cb(step) \ | |||||
c[0][step] = GiFloat32Type2FixLenType(MLA( \ | |||||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ | |||||
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); \ | |||||
c[1][step] = GiFloat32Type2FixLenType(MLA( \ | |||||
GiFixLenType2GiFloat32Type(c[1][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \ | |||||
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); | |||||
#define cb2(step) \ | |||||
c[0][step] = GiFloat32Type2FixLenType(MLA( \ | |||||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ | |||||
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); | |||||
#define MLA(a, b, c, d) \ | |||||
GiMultiplyAddScalarFloat32( \ | |||||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d)) | |||||
#else | #else | ||||
#define cb(step) \ | |||||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ | |||||
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ | |||||
(step * stride + src_idx) % 4)); \ | |||||
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(c[1][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \ | |||||
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ | |||||
(step * stride + src_idx) % 4)); | |||||
#define cb2(step) \ | |||||
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(c[0][step]), \ | |||||
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ | |||||
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ | |||||
(step * stride + src_idx) % 4)); | |||||
#undef MLA | |||||
#define MLA(a, b, c, d) \ | |||||
GiSimdFmaLane( \ | |||||
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \ | |||||
GiFixLenType2GiFloat32Type(c), d) | |||||
#endif | #endif | ||||
//! cb(step): one multiply-accumulate step for output column `step`, applied to
//! both accumulator rows c[0][step] and c[1][step] with weight[0][weight_idx]
//! and weight[1][weight_idx] respectively.
//! The source element index is (step * stride + src_idx): "/ 4" selects the
//! entry of `src` and "% 4" selects the lane (or float offset, depending on
//! which MLA variant is active) inside that entry.
//! Relies on `c`, `weight`, `src`, `stride`, `src_idx` and `weight_idx` being
//! visible at the expansion site, and on MLA as defined just above.
#define cb(step) \ | |||||
c[0][step] = GiFloat32Type2FixLenType( \ | |||||
MLA(c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \ | |||||
(step * stride + src_idx) % 4)); \ | |||||
c[1][step] = GiFloat32Type2FixLenType( \ | |||||
MLA(c[1][step], weight[1][weight_idx], src[(step * stride + src_idx) / 4], \ | |||||
(step * stride + src_idx) % 4)); | |||||
//! cb2(step): same as cb(step) but updates only accumulator row c[0][step]
//! (single weight row, weight[0][weight_idx]).
#define cb2(step) \ | |||||
c[0][step] = GiFloat32Type2FixLenType( \ | |||||
MLA(c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \ | |||||
(step * stride + src_idx) % 4)); | |||||
#define SHIFT_CAL_HELPER(ow_remain) \ | #define SHIFT_CAL_HELPER(ow_remain) \ | ||||
template < \ | template < \ | ||||
int src_idx, int weight_idx, int stride, typename T, typename T2, \ | int src_idx, int weight_idx, int stride, typename T, typename T2, \ | ||||
@@ -108,6 +93,7 @@ SHIFT_CAL_HELPER(8) | |||||
#undef SHIFT_CAL_HELPER | #undef SHIFT_CAL_HELPER | ||||
#undef cb | #undef cb | ||||
#undef cb2 | #undef cb2 | ||||
#undef MLA | |||||
template < | template < | ||||
int src_idx, int weight_idx, int c_dim, int stride, int remain_w, typename T, | int src_idx, int weight_idx, int c_dim, int stride, int remain_w, typename T, | ||||
@@ -17,6 +17,30 @@ using namespace conv_stride1; | |||||
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | ||||
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | ||||
//! PREFER_VF selects the pointer/scalar variant of MLA below; it is enabled
//! only when GI_RVV_INTRINSICS is defined.
#if defined(GI_RVV_INTRINSICS) | |||||
#define PREFER_VF | |||||
#endif | |||||
//! MLA(a, b, c, d): multiply-accumulate helper used by the stride-1 kernels in
//! this file. The meaning of `c` differs between the two variants, so the
//! kernel-weight variables passed as `c` are declared per variant at each use.
#if defined(PREFER_VF) | |||||
//! PREFER_VF variant: `c` is a `const float*` and `d` an element offset; the
//! scalar *(c + d) is handed to GiMultiplyAddScalarFloat32 together with a, b.
//! NOTE(review): the macro parameters are not parenthesised, so pass only
//! simple expressions (plain identifiers / literals) for `c` and `d`.
#define MLA(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) | |||||
namespace { | |||||
//! Builds one SIMD-width worth of floats in `ret` from two float arrays:
//! with t_count = GI_SIMD_LEN_BYTE / sizeof(float), it copies
//! a[n .. t_count - 1] to ret[0 .. t_count - n - 1] and then
//! b[0 .. n - 1] to ret[t_count - n .. t_count - 1].
//! Appears to be the pointer-based counterpart of GiExtqFloat32(a, b, n) (see
//! the _k6789 setup in do_conv_3x3_stride1) - TODO confirm.
//! \param a   first input; elements [n, t_count) are read
//! \param b   second input; elements [0, n) are read
//! \param n   number of leading elements of `a` to drop; not validated here,
//!            callers are expected to pass 0 <= n <= t_count
//! \param ret output buffer; t_count floats are written
GI_FORCEINLINE void ext_float32_ptr( | |||||
const float* a, const float* b, const int n, float* ret) { | |||||
//! number of floats in one SIMD vector
int t_count = GI_SIMD_LEN_BYTE / sizeof(float); | |||||
//! number of elements taken from the tail of `a`
int a_count = t_count - n; | |||||
for (int i = 0; i < a_count; i++) { | |||||
ret[i] = a[i + n]; | |||||
} | |||||
for (int i = 0; i < n; i++) { | |||||
ret[i + a_count] = b[i]; | |||||
} | |||||
} | |||||
}; // namespace | |||||
#else | |||||
//! Default variant: `c` is a SIMD vector and `d` a lane index; forwards
//! unchanged to GiSimdFmaLane.
#define MLA(a, b, c, d) GiSimdFmaLane(a, b, c, d) | |||||
#endif | |||||
void conv_stride1::do_conv_2x2_stride1( | void conv_stride1::do_conv_2x2_stride1( | ||||
const float* src, const float* filter, float* dst, size_t IH, size_t IW, | const float* src, const float* filter, float* dst, size_t IH, size_t IW, | ||||
size_t OH, size_t OW, size_t IC) { | size_t OH, size_t OW, size_t IC) { | ||||
@@ -143,10 +167,18 @@ void conv_stride1::do_conv_3x3_stride1( | |||||
const float* k1 = filter + 3; | const float* k1 = filter + 3; | ||||
const float* k2 = filter + 5; | const float* k2 = filter + 5; | ||||
#if defined(PREFER_VF) | |||||
const float* _k0123 = k0; | |||||
const float* _k3456 = k1; | |||||
const float* _k5678 = k2; | |||||
float _k6789[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
ext_float32_ptr(_k5678, _k5678, 1, _k6789); | |||||
#else | |||||
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | ||||
GI_FLOAT32_t _k3456 = GiLoadFloat32(k1); | GI_FLOAT32_t _k3456 = GiLoadFloat32(k1); | ||||
GI_FLOAT32_t _k5678 = GiLoadFloat32(k2); | GI_FLOAT32_t _k5678 = GiLoadFloat32(k2); | ||||
GI_FLOAT32_t _k6789 = GiExtqFloat32(_k5678, _k5678, 1); | GI_FLOAT32_t _k6789 = GiExtqFloat32(_k5678, _k5678, 1); | ||||
#endif | |||||
size_t h = 0; | size_t h = 0; | ||||
for (; h + 1 < OH; h += 2) { | for (; h + 1 < OH; h += 2) { | ||||
@@ -178,25 +210,25 @@ void conv_stride1::do_conv_3x3_stride1( | |||||
GI_FLOAT32_t _r31 = GiExtqFloat32(_r30, _r30n, 1); | GI_FLOAT32_t _r31 = GiExtqFloat32(_r30, _r30n, 1); | ||||
GI_FLOAT32_t _r32 = GiExtqFloat32(_r30, _r30n, 2); | GI_FLOAT32_t _r32 = GiExtqFloat32(_r30, _r30n, 2); | ||||
_sum1 = GiSimdFmaLane(_sum1, _r00, _k0123, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r01, _k0123, 1); | |||||
_sum1 = GiSimdFmaLane(_sum1, _r02, _k0123, 2); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r10, _k3456, 0); | |||||
_sum1 = GiSimdFmaLane(_sum1, _r11, _k3456, 1); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r12, _k3456, 2); | |||||
_sum1 = GiSimdFmaLane(_sum1, _r20, _k6789, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r21, _k6789, 1); | |||||
_sum1 = GiSimdFmaLane(_sum1, _r22, _k6789, 2); | |||||
_sum3 = GiSimdFmaLane(_sum3, _r10, _k0123, 0); | |||||
_sum4 = GiSimdFmaLane(_sum4, _r11, _k0123, 1); | |||||
_sum3 = GiSimdFmaLane(_sum3, _r12, _k0123, 2); | |||||
_sum4 = GiSimdFmaLane(_sum4, _r20, _k3456, 0); | |||||
_sum3 = GiSimdFmaLane(_sum3, _r21, _k3456, 1); | |||||
_sum4 = GiSimdFmaLane(_sum4, _r22, _k3456, 2); | |||||
_sum3 = GiSimdFmaLane(_sum3, _r30, _k6789, 0); | |||||
_sum4 = GiSimdFmaLane(_sum4, _r31, _k6789, 1); | |||||
_sum3 = GiSimdFmaLane(_sum3, _r32, _k6789, 2); | |||||
_sum1 = MLA(_sum1, _r00, _k0123, 0); | |||||
_sum2 = MLA(_sum2, _r01, _k0123, 1); | |||||
_sum1 = MLA(_sum1, _r02, _k0123, 2); | |||||
_sum2 = MLA(_sum2, _r10, _k3456, 0); | |||||
_sum1 = MLA(_sum1, _r11, _k3456, 1); | |||||
_sum2 = MLA(_sum2, _r12, _k3456, 2); | |||||
_sum1 = MLA(_sum1, _r20, _k6789, 0); | |||||
_sum2 = MLA(_sum2, _r21, _k6789, 1); | |||||
_sum1 = MLA(_sum1, _r22, _k6789, 2); | |||||
_sum3 = MLA(_sum3, _r10, _k0123, 0); | |||||
_sum4 = MLA(_sum4, _r11, _k0123, 1); | |||||
_sum3 = MLA(_sum3, _r12, _k0123, 2); | |||||
_sum4 = MLA(_sum4, _r20, _k3456, 0); | |||||
_sum3 = MLA(_sum3, _r21, _k3456, 1); | |||||
_sum4 = MLA(_sum4, _r22, _k3456, 2); | |||||
_sum3 = MLA(_sum3, _r30, _k6789, 0); | |||||
_sum4 = MLA(_sum4, _r31, _k6789, 1); | |||||
_sum3 = MLA(_sum3, _r32, _k6789, 2); | |||||
_sum1 = GiAddFloat32(_sum1, _sum2); | _sum1 = GiAddFloat32(_sum1, _sum2); | ||||
_sum3 = GiAddFloat32(_sum3, _sum4); | _sum3 = GiAddFloat32(_sum3, _sum4); | ||||
@@ -243,15 +275,15 @@ void conv_stride1::do_conv_3x3_stride1( | |||||
GI_FLOAT32_t _r21 = GiExtqFloat32(_r20, _r20n, 1); | GI_FLOAT32_t _r21 = GiExtqFloat32(_r20, _r20n, 1); | ||||
GI_FLOAT32_t _r22 = GiExtqFloat32(_r20, _r20n, 2); | GI_FLOAT32_t _r22 = GiExtqFloat32(_r20, _r20n, 2); | ||||
_sum1 = GiSimdFmaLane(_sum1, _r00, _k0123, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r01, _k0123, 1); | |||||
_sum1 = GiSimdFmaLane(_sum1, _r02, _k0123, 2); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r10, _k3456, 0); | |||||
_sum1 = GiSimdFmaLane(_sum1, _r11, _k3456, 1); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r12, _k3456, 2); | |||||
_sum1 = GiSimdFmaLane(_sum1, _r20, _k6789, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r21, _k6789, 1); | |||||
_sum1 = GiSimdFmaLane(_sum1, _r22, _k6789, 2); | |||||
_sum1 = MLA(_sum1, _r00, _k0123, 0); | |||||
_sum2 = MLA(_sum2, _r01, _k0123, 1); | |||||
_sum1 = MLA(_sum1, _r02, _k0123, 2); | |||||
_sum2 = MLA(_sum2, _r10, _k3456, 0); | |||||
_sum1 = MLA(_sum1, _r11, _k3456, 1); | |||||
_sum2 = MLA(_sum2, _r12, _k3456, 2); | |||||
_sum1 = MLA(_sum1, _r20, _k6789, 0); | |||||
_sum2 = MLA(_sum2, _r21, _k6789, 1); | |||||
_sum1 = MLA(_sum1, _r22, _k6789, 2); | |||||
_sum1 = GiAddFloat32(_sum1, _sum2); | _sum1 = GiAddFloat32(_sum1, _sum2); | ||||
@@ -288,6 +320,15 @@ void conv_stride1::do_conv_5x5_stride1( | |||||
const float* r4 = src_ptr + IW * 4; | const float* r4 = src_ptr + IW * 4; | ||||
const float* r5 = src_ptr + IW * 5; | const float* r5 = src_ptr + IW * 5; | ||||
#if defined(PREFER_VF) | |||||
const float* _k0123 = filter; | |||||
const float* _k4567 = filter + 4; | |||||
const float* _k891011 = filter + 8; | |||||
const float* _k12131415 = filter + 12; | |||||
const float* _k16171819 = filter + 16; | |||||
const float* _k20212223 = filter + 20; | |||||
const float* _k24242424 = filter + 24; | |||||
#else | |||||
GI_FLOAT32_t _k0123 = GiLoadFloat32(filter); | GI_FLOAT32_t _k0123 = GiLoadFloat32(filter); | ||||
GI_FLOAT32_t _k4567 = GiLoadFloat32(filter + 4); | GI_FLOAT32_t _k4567 = GiLoadFloat32(filter + 4); | ||||
GI_FLOAT32_t _k891011 = GiLoadFloat32(filter + 8); | GI_FLOAT32_t _k891011 = GiLoadFloat32(filter + 8); | ||||
@@ -295,6 +336,7 @@ void conv_stride1::do_conv_5x5_stride1( | |||||
GI_FLOAT32_t _k16171819 = GiLoadFloat32(filter + 16); | GI_FLOAT32_t _k16171819 = GiLoadFloat32(filter + 16); | ||||
GI_FLOAT32_t _k20212223 = GiLoadFloat32(filter + 20); | GI_FLOAT32_t _k20212223 = GiLoadFloat32(filter + 20); | ||||
GI_FLOAT32_t _k24242424 = GiBroadcastFloat32(filter[24]); | GI_FLOAT32_t _k24242424 = GiBroadcastFloat32(filter[24]); | ||||
#endif | |||||
size_t h = 0; | size_t h = 0; | ||||
for (; h + 1 < OH; h += 2) { | for (; h + 1 < OH; h += 2) { | ||||
@@ -340,65 +382,65 @@ void conv_stride1::do_conv_5x5_stride1( | |||||
GI_FLOAT32_t _r52 = GiExtqFloat32(_r50, _r54, 2); | GI_FLOAT32_t _r52 = GiExtqFloat32(_r50, _r54, 2); | ||||
GI_FLOAT32_t _r53 = GiExtqFloat32(_r50, _r54, 3); | GI_FLOAT32_t _r53 = GiExtqFloat32(_r50, _r54, 3); | ||||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r10, _k4567, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r11, _k4567, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r12, _k4567, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r13, _k891011, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r14, _k891011, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r20, _k891011, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r21, _k891011, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r10, _k0123, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r11, _k0123, 1); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r12, _k0123, 2); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r13, _k0123, 3); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r14, _k4567, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r20, _k4567, 1); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r21, _k4567, 2); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r22, _k4567, 3); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r23, _k891011, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r24, _k891011, 1); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r30, _k891011, 2); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r31, _k891011, 3); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r32, _k12131415, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r33, _k12131415, 1); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r34, _k12131415, 2); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r40, _k12131415, 3); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r41, _k16171819, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r42, _k16171819, 1); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r43, _k16171819, 2); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r44, _k16171819, 3); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r50, _k20212223, 0); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r51, _k20212223, 1); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r52, _k20212223, 2); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r53, _k20212223, 3); | |||||
_sum2 = GiSimdFmaLane(_sum2, _r54, _k24242424, 0); | |||||
_sum = MLA(_sum, _r00, _k0123, 0); | |||||
_sum = MLA(_sum, _r01, _k0123, 1); | |||||
_sum = MLA(_sum, _r02, _k0123, 2); | |||||
_sum = MLA(_sum, _r03, _k0123, 3); | |||||
_sum = MLA(_sum, _r04, _k4567, 0); | |||||
_sum = MLA(_sum, _r10, _k4567, 1); | |||||
_sum = MLA(_sum, _r11, _k4567, 2); | |||||
_sum = MLA(_sum, _r12, _k4567, 3); | |||||
_sum = MLA(_sum, _r13, _k891011, 0); | |||||
_sum = MLA(_sum, _r14, _k891011, 1); | |||||
_sum = MLA(_sum, _r20, _k891011, 2); | |||||
_sum = MLA(_sum, _r21, _k891011, 3); | |||||
_sum = MLA(_sum, _r22, _k12131415, 0); | |||||
_sum = MLA(_sum, _r23, _k12131415, 1); | |||||
_sum = MLA(_sum, _r24, _k12131415, 2); | |||||
_sum = MLA(_sum, _r30, _k12131415, 3); | |||||
_sum = MLA(_sum, _r31, _k16171819, 0); | |||||
_sum = MLA(_sum, _r32, _k16171819, 1); | |||||
_sum = MLA(_sum, _r33, _k16171819, 2); | |||||
_sum = MLA(_sum, _r34, _k16171819, 3); | |||||
_sum = MLA(_sum, _r40, _k20212223, 0); | |||||
_sum = MLA(_sum, _r41, _k20212223, 1); | |||||
_sum = MLA(_sum, _r42, _k20212223, 2); | |||||
_sum = MLA(_sum, _r43, _k20212223, 3); | |||||
_sum = MLA(_sum, _r44, _k24242424, 0); | |||||
_sum2 = MLA(_sum2, _r10, _k0123, 0); | |||||
_sum2 = MLA(_sum2, _r11, _k0123, 1); | |||||
_sum2 = MLA(_sum2, _r12, _k0123, 2); | |||||
_sum2 = MLA(_sum2, _r13, _k0123, 3); | |||||
_sum2 = MLA(_sum2, _r14, _k4567, 0); | |||||
_sum2 = MLA(_sum2, _r20, _k4567, 1); | |||||
_sum2 = MLA(_sum2, _r21, _k4567, 2); | |||||
_sum2 = MLA(_sum2, _r22, _k4567, 3); | |||||
_sum2 = MLA(_sum2, _r23, _k891011, 0); | |||||
_sum2 = MLA(_sum2, _r24, _k891011, 1); | |||||
_sum2 = MLA(_sum2, _r30, _k891011, 2); | |||||
_sum2 = MLA(_sum2, _r31, _k891011, 3); | |||||
_sum2 = MLA(_sum2, _r32, _k12131415, 0); | |||||
_sum2 = MLA(_sum2, _r33, _k12131415, 1); | |||||
_sum2 = MLA(_sum2, _r34, _k12131415, 2); | |||||
_sum2 = MLA(_sum2, _r40, _k12131415, 3); | |||||
_sum2 = MLA(_sum2, _r41, _k16171819, 0); | |||||
_sum2 = MLA(_sum2, _r42, _k16171819, 1); | |||||
_sum2 = MLA(_sum2, _r43, _k16171819, 2); | |||||
_sum2 = MLA(_sum2, _r44, _k16171819, 3); | |||||
_sum2 = MLA(_sum2, _r50, _k20212223, 0); | |||||
_sum2 = MLA(_sum2, _r51, _k20212223, 1); | |||||
_sum2 = MLA(_sum2, _r52, _k20212223, 2); | |||||
_sum2 = MLA(_sum2, _r53, _k20212223, 3); | |||||
_sum2 = MLA(_sum2, _r54, _k24242424, 0); | |||||
GiStoreFloat32(outptr, _sum); | GiStoreFloat32(outptr, _sum); | ||||
GiStoreFloat32(outptr2, _sum2); | GiStoreFloat32(outptr2, _sum2); | ||||
@@ -460,35 +502,35 @@ void conv_stride1::do_conv_5x5_stride1( | |||||
GI_FLOAT32_t _r42 = GiExtqFloat32(_r40, _r44, 2); | GI_FLOAT32_t _r42 = GiExtqFloat32(_r40, _r44, 2); | ||||
GI_FLOAT32_t _r43 = GiExtqFloat32(_r40, _r44, 3); | GI_FLOAT32_t _r43 = GiExtqFloat32(_r40, _r44, 3); | ||||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r10, _k4567, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r11, _k4567, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r12, _k4567, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r13, _k891011, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r14, _k891011, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r20, _k891011, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r21, _k891011, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0); | |||||
_sum = MLA(_sum, _r00, _k0123, 0); | |||||
_sum = MLA(_sum, _r01, _k0123, 1); | |||||
_sum = MLA(_sum, _r02, _k0123, 2); | |||||
_sum = MLA(_sum, _r03, _k0123, 3); | |||||
_sum = MLA(_sum, _r04, _k4567, 0); | |||||
_sum = MLA(_sum, _r10, _k4567, 1); | |||||
_sum = MLA(_sum, _r11, _k4567, 2); | |||||
_sum = MLA(_sum, _r12, _k4567, 3); | |||||
_sum = MLA(_sum, _r13, _k891011, 0); | |||||
_sum = MLA(_sum, _r14, _k891011, 1); | |||||
_sum = MLA(_sum, _r20, _k891011, 2); | |||||
_sum = MLA(_sum, _r21, _k891011, 3); | |||||
_sum = MLA(_sum, _r22, _k12131415, 0); | |||||
_sum = MLA(_sum, _r23, _k12131415, 1); | |||||
_sum = MLA(_sum, _r24, _k12131415, 2); | |||||
_sum = MLA(_sum, _r30, _k12131415, 3); | |||||
_sum = MLA(_sum, _r31, _k16171819, 0); | |||||
_sum = MLA(_sum, _r32, _k16171819, 1); | |||||
_sum = MLA(_sum, _r33, _k16171819, 2); | |||||
_sum = MLA(_sum, _r34, _k16171819, 3); | |||||
_sum = MLA(_sum, _r40, _k20212223, 0); | |||||
_sum = MLA(_sum, _r41, _k20212223, 1); | |||||
_sum = MLA(_sum, _r42, _k20212223, 2); | |||||
_sum = MLA(_sum, _r43, _k20212223, 3); | |||||
_sum = MLA(_sum, _r44, _k24242424, 0); | |||||
GiStoreFloat32(outptr, _sum); | GiStoreFloat32(outptr, _sum); | ||||
@@ -542,8 +584,13 @@ void conv_stride1::do_conv_7x7_stride1( | |||||
rep(i, width) { | rep(i, width) { | ||||
GI_FLOAT32_t _sum = GiLoadFloat32(outptr); | GI_FLOAT32_t _sum = GiLoadFloat32(outptr); | ||||
#if defined(PREFER_VF) | |||||
const float* _k0123 = k0; | |||||
const float* _k4567 = k0 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | ||||
GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4); | GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4); | ||||
#endif | |||||
GI_FLOAT32_t _r00 = GiLoadFloat32(r0); // 0 1 2 3 | GI_FLOAT32_t _r00 = GiLoadFloat32(r0); // 0 1 2 3 | ||||
GI_FLOAT32_t _r04 = GiLoadFloat32(r0 + 4); // 4 5 6 7 | GI_FLOAT32_t _r04 = GiLoadFloat32(r0 + 4); // 4 5 6 7 | ||||
@@ -554,16 +601,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||||
GI_FLOAT32_t _r05 = GiExtqFloat32(_r04, _r00n, 1); // 5 6 7 8 | GI_FLOAT32_t _r05 = GiExtqFloat32(_r04, _r00n, 1); // 5 6 7 8 | ||||
GI_FLOAT32_t _r06 = GiExtqFloat32(_r04, _r00n, 2); // 6 7 8 9 | GI_FLOAT32_t _r06 = GiExtqFloat32(_r04, _r00n, 2); // 6 7 8 9 | ||||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r05, _k4567, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r06, _k4567, 2); | |||||
_sum = MLA(_sum, _r00, _k0123, 0); | |||||
_sum = MLA(_sum, _r01, _k0123, 1); | |||||
_sum = MLA(_sum, _r02, _k0123, 2); | |||||
_sum = MLA(_sum, _r03, _k0123, 3); | |||||
_sum = MLA(_sum, _r04, _k4567, 0); | |||||
_sum = MLA(_sum, _r05, _k4567, 1); | |||||
_sum = MLA(_sum, _r06, _k4567, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k78910 = k1; | |||||
const float* _k11121314 = k1 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k78910 = GiLoadFloat32(k1); | GI_FLOAT32_t _k78910 = GiLoadFloat32(k1); | ||||
GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4); | GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4); | ||||
#endif | |||||
GI_FLOAT32_t _r10 = GiLoadFloat32(r1); | GI_FLOAT32_t _r10 = GiLoadFloat32(r1); | ||||
GI_FLOAT32_t _r14 = GiLoadFloat32(r1 + 4); | GI_FLOAT32_t _r14 = GiLoadFloat32(r1 + 4); | ||||
@@ -574,16 +626,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||||
GI_FLOAT32_t _r15 = GiExtqFloat32(_r14, _r10n, 1); | GI_FLOAT32_t _r15 = GiExtqFloat32(_r14, _r10n, 1); | ||||
GI_FLOAT32_t _r16 = GiExtqFloat32(_r14, _r10n, 2); | GI_FLOAT32_t _r16 = GiExtqFloat32(_r14, _r10n, 2); | ||||
_sum = GiSimdFmaLane(_sum, _r10, _k78910, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r11, _k78910, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r12, _k78910, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r13, _k78910, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r14, _k11121314, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r15, _k11121314, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r16, _k11121314, 2); | |||||
_sum = MLA(_sum, _r10, _k78910, 0); | |||||
_sum = MLA(_sum, _r11, _k78910, 1); | |||||
_sum = MLA(_sum, _r12, _k78910, 2); | |||||
_sum = MLA(_sum, _r13, _k78910, 3); | |||||
_sum = MLA(_sum, _r14, _k11121314, 0); | |||||
_sum = MLA(_sum, _r15, _k11121314, 1); | |||||
_sum = MLA(_sum, _r16, _k11121314, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k14151617 = k2; | |||||
const float* _k18192021 = k2 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2); | GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2); | ||||
GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4); | GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4); | ||||
#endif | |||||
GI_FLOAT32_t _r20 = GiLoadFloat32(r2); | GI_FLOAT32_t _r20 = GiLoadFloat32(r2); | ||||
GI_FLOAT32_t _r24 = GiLoadFloat32(r2 + 4); | GI_FLOAT32_t _r24 = GiLoadFloat32(r2 + 4); | ||||
@@ -594,16 +651,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||||
GI_FLOAT32_t _r25 = GiExtqFloat32(_r24, _r20n, 1); | GI_FLOAT32_t _r25 = GiExtqFloat32(_r24, _r20n, 1); | ||||
GI_FLOAT32_t _r26 = GiExtqFloat32(_r24, _r20n, 2); | GI_FLOAT32_t _r26 = GiExtqFloat32(_r24, _r20n, 2); | ||||
_sum = GiSimdFmaLane(_sum, _r20, _k14151617, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r21, _k14151617, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r22, _k14151617, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r23, _k14151617, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r24, _k18192021, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r25, _k18192021, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r26, _k18192021, 2); | |||||
_sum = MLA(_sum, _r20, _k14151617, 0); | |||||
_sum = MLA(_sum, _r21, _k14151617, 1); | |||||
_sum = MLA(_sum, _r22, _k14151617, 2); | |||||
_sum = MLA(_sum, _r23, _k14151617, 3); | |||||
_sum = MLA(_sum, _r24, _k18192021, 0); | |||||
_sum = MLA(_sum, _r25, _k18192021, 1); | |||||
_sum = MLA(_sum, _r26, _k18192021, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k21222324 = k3; | |||||
const float* _k25262728 = k3 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3); | GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3); | ||||
GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4); | GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4); | ||||
#endif | |||||
GI_FLOAT32_t _r30 = GiLoadFloat32(r3); | GI_FLOAT32_t _r30 = GiLoadFloat32(r3); | ||||
GI_FLOAT32_t _r34 = GiLoadFloat32(r3 + 4); | GI_FLOAT32_t _r34 = GiLoadFloat32(r3 + 4); | ||||
@@ -614,16 +676,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||||
GI_FLOAT32_t _r35 = GiExtqFloat32(_r34, _r30n, 1); | GI_FLOAT32_t _r35 = GiExtqFloat32(_r34, _r30n, 1); | ||||
GI_FLOAT32_t _r36 = GiExtqFloat32(_r34, _r30n, 2); | GI_FLOAT32_t _r36 = GiExtqFloat32(_r34, _r30n, 2); | ||||
_sum = GiSimdFmaLane(_sum, _r30, _k21222324, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r31, _k21222324, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r32, _k21222324, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r33, _k21222324, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r34, _k25262728, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r35, _k25262728, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r36, _k25262728, 2); | |||||
_sum = MLA(_sum, _r30, _k21222324, 0); | |||||
_sum = MLA(_sum, _r31, _k21222324, 1); | |||||
_sum = MLA(_sum, _r32, _k21222324, 2); | |||||
_sum = MLA(_sum, _r33, _k21222324, 3); | |||||
_sum = MLA(_sum, _r34, _k25262728, 0); | |||||
_sum = MLA(_sum, _r35, _k25262728, 1); | |||||
_sum = MLA(_sum, _r36, _k25262728, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k28293031 = k4; | |||||
const float* _k32333435 = k4 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4); | GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4); | ||||
GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4); | GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4); | ||||
#endif | |||||
GI_FLOAT32_t _r40 = GiLoadFloat32(r4); | GI_FLOAT32_t _r40 = GiLoadFloat32(r4); | ||||
GI_FLOAT32_t _r44 = GiLoadFloat32(r4 + 4); | GI_FLOAT32_t _r44 = GiLoadFloat32(r4 + 4); | ||||
@@ -634,16 +701,21 @@ void conv_stride1::do_conv_7x7_stride1( | |||||
GI_FLOAT32_t _r45 = GiExtqFloat32(_r44, _r40n, 1); | GI_FLOAT32_t _r45 = GiExtqFloat32(_r44, _r40n, 1); | ||||
GI_FLOAT32_t _r46 = GiExtqFloat32(_r44, _r40n, 2); | GI_FLOAT32_t _r46 = GiExtqFloat32(_r44, _r40n, 2); | ||||
_sum = GiSimdFmaLane(_sum, _r40, _k28293031, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r41, _k28293031, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r42, _k28293031, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r43, _k28293031, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r44, _k32333435, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r45, _k32333435, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r46, _k32333435, 2); | |||||
_sum = MLA(_sum, _r40, _k28293031, 0); | |||||
_sum = MLA(_sum, _r41, _k28293031, 1); | |||||
_sum = MLA(_sum, _r42, _k28293031, 2); | |||||
_sum = MLA(_sum, _r43, _k28293031, 3); | |||||
_sum = MLA(_sum, _r44, _k32333435, 0); | |||||
_sum = MLA(_sum, _r45, _k32333435, 1); | |||||
_sum = MLA(_sum, _r46, _k32333435, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k35363738 = k5; | |||||
const float* _k39404142 = k5 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5); | GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5); | ||||
GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4); | GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4); | ||||
#endif | |||||
GI_FLOAT32_t _r50 = GiLoadFloat32(r5); | GI_FLOAT32_t _r50 = GiLoadFloat32(r5); | ||||
GI_FLOAT32_t _r54 = GiLoadFloat32(r5 + 4); | GI_FLOAT32_t _r54 = GiLoadFloat32(r5 + 4); | ||||
@@ -654,17 +726,24 @@ void conv_stride1::do_conv_7x7_stride1( | |||||
GI_FLOAT32_t _r55 = GiExtqFloat32(_r54, _r50n, 1); | GI_FLOAT32_t _r55 = GiExtqFloat32(_r54, _r50n, 1); | ||||
GI_FLOAT32_t _r56 = GiExtqFloat32(_r54, _r50n, 2); | GI_FLOAT32_t _r56 = GiExtqFloat32(_r54, _r50n, 2); | ||||
_sum = GiSimdFmaLane(_sum, _r50, _k35363738, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r51, _k35363738, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r52, _k35363738, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r53, _k35363738, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r54, _k39404142, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r55, _k39404142, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r56, _k39404142, 2); | |||||
_sum = MLA(_sum, _r50, _k35363738, 0); | |||||
_sum = MLA(_sum, _r51, _k35363738, 1); | |||||
_sum = MLA(_sum, _r52, _k35363738, 2); | |||||
_sum = MLA(_sum, _r53, _k35363738, 3); | |||||
_sum = MLA(_sum, _r54, _k39404142, 0); | |||||
_sum = MLA(_sum, _r55, _k39404142, 1); | |||||
_sum = MLA(_sum, _r56, _k39404142, 2); | |||||
#if defined(PREFER_VF)
//! Scalar-coefficient path: MLA dereferences *(ptr + lane), so the first four
//! coefficients are read straight from the filter memory at k6.
const float* _k42434445 = k6;
//! Only elements 0..2 of _k46474849 are consumed by the MLA calls below, which
//! matches the three lanes built in the #else branch. Copy exactly those three
//! floats. The previous length expression,
//! `sizeof(float) * GI_SIMD_LEN_BYTE / sizeof(float) - 1`, evaluated to
//! GI_SIMD_LEN_BYTE - 1 bytes because of operator precedence, so it read past
//! the three coefficients and ended on a partial float.
float _k46474849[GI_SIMD_LEN_BYTE / sizeof(float)];
memcpy(_k46474849, k6 + 4, sizeof(float) * 3);
#else
//! Vector-coefficient path: lanes 0..1 come from the two floats at k6 + 4 and
//! lane 2 from k6 + 6, so nothing beyond k6 + 6 is touched.
GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6);
GI_FLOAT32_t _k46474849 =
        GiLd1qLaneFloat32(k6 + 4 + 2, GiLoadFloat32LowHalf(k6 + 4), 2);
#endif
GI_FLOAT32_t _r60 = GiLoadFloat32(r6); | GI_FLOAT32_t _r60 = GiLoadFloat32(r6); | ||||
GI_FLOAT32_t _r64 = GiLoadFloat32(r6 + 4); | GI_FLOAT32_t _r64 = GiLoadFloat32(r6 + 4); | ||||
@@ -675,13 +754,13 @@ void conv_stride1::do_conv_7x7_stride1( | |||||
GI_FLOAT32_t _r65 = GiExtqFloat32(_r64, _r60n, 1); | GI_FLOAT32_t _r65 = GiExtqFloat32(_r64, _r60n, 1); | ||||
GI_FLOAT32_t _r66 = GiExtqFloat32(_r64, _r60n, 2); | GI_FLOAT32_t _r66 = GiExtqFloat32(_r64, _r60n, 2); | ||||
_sum = GiSimdFmaLane(_sum, _r60, _k42434445, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r61, _k42434445, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r62, _k42434445, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r63, _k42434445, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r64, _k46474849, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r65, _k46474849, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r66, _k46474849, 2); | |||||
_sum = MLA(_sum, _r60, _k42434445, 0); | |||||
_sum = MLA(_sum, _r61, _k42434445, 1); | |||||
_sum = MLA(_sum, _r62, _k42434445, 2); | |||||
_sum = MLA(_sum, _r63, _k42434445, 3); | |||||
_sum = MLA(_sum, _r64, _k46474849, 0); | |||||
_sum = MLA(_sum, _r65, _k46474849, 1); | |||||
_sum = MLA(_sum, _r66, _k46474849, 2); | |||||
GiStoreFloat32(outptr, _sum); | GiStoreFloat32(outptr, _sum); | ||||
@@ -15,6 +15,30 @@ using namespace conv_stride2; | |||||
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | ||||
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | ||||
//! On RVV builds the kernels keep filter coefficients as plain float pointers
//! and use the vector-times-scalar multiply-add instead of the lane-indexed
//! form.
#if defined(GI_RVV_INTRINSICS)
#define PREFER_VF
#endif

#if defined(PREFER_VF)
//! MLA(a, b, c, d): c is a `const float*` (or float array) and d the index of
//! the coefficient to use. Arguments are parenthesised so that expressions
//! such as `ptr + 4` or `i + 1` can be passed without precedence surprises.
//! NOTE(review): GiMultiplyAddScalarFloat32 is presumably a + b * scalar,
//! mirroring GiSimdFmaLane -- confirm against the GI headers.
#define MLA(a, b, c, d) GiMultiplyAddScalarFloat32((a), (b), *((c) + (d)))
namespace {
/*!
 * \brief Array counterpart of the GiExtqFloat32 calls used in the
 *        non-PREFER_VF branches.
 *
 * Writes t_count = GI_SIMD_LEN_BYTE / sizeof(float) values to \p ret:
 * first a[n], a[n + 1], ..., a[t_count - 1], then b[0], ..., b[n - 1].
 *
 * \param a   first source; elements n .. t_count - 1 are read
 * \param b   second source; elements 0 .. n - 1 are read
 * \param n   number of leading elements of \p a to drop; expected to be in
 *            [0, t_count] (not checked here)
 * \param ret destination with room for t_count floats; must not be a buffer
 *            that is still being read through \p a or \p b
 */
GI_FORCEINLINE void ext_float32_ptr(
        const float* a, const float* b, const int n, float* ret) {
    const int t_count = GI_SIMD_LEN_BYTE / sizeof(float);
    const int a_count = t_count - n;
    //! tail of a goes to the front of ret
    for (int i = 0; i < a_count; i++) {
        ret[i] = a[i + n];
    }
    //! head of b fills the remaining n slots
    for (int i = 0; i < n; i++) {
        ret[i + a_count] = b[i];
    }
}
}  // namespace
#else
//! Lane-indexed form: c is a GI_FLOAT32_t and d selects the lane.
#define MLA(a, b, c, d) GiSimdFmaLane(a, b, c, d)
#endif
void conv_stride2::do_conv_2x2_stride2( | void conv_stride2::do_conv_2x2_stride2( | ||||
const float* src, const float* filter, float* dst, size_t IH, size_t IW, | const float* src, const float* filter, float* dst, size_t IH, size_t IW, | ||||
size_t OH, size_t OW, size_t IC) { | size_t OH, size_t OW, size_t IC) { | ||||
@@ -29,7 +53,11 @@ void conv_stride2::do_conv_2x2_stride2( | |||||
const float* k0 = filter; | const float* k0 = filter; | ||||
#if defined(PREFER_VF) | |||||
const float* _k0123 = k0; | |||||
#else | |||||
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | ||||
#endif | |||||
rep(h, OH) { | rep(h, OH) { | ||||
int nn = OW >> 2; | int nn = OW >> 2; | ||||
@@ -41,16 +69,16 @@ void conv_stride2::do_conv_2x2_stride2( | |||||
GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6 | GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6 | ||||
GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7 | GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7 | ||||
_outp = GiSimdFmaLane(_outp, _r00, _k0123, 0); | |||||
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); | |||||
_outp = MLA(_outp, _r00, _k0123, 0); | |||||
_outp = MLA(_outp, _r01, _k0123, 1); | |||||
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); | GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); | ||||
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0); | GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0); | ||||
GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1); | GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1); | ||||
_outp = GiSimdFmaLane(_outp, _r10, _k0123, 2); | |||||
_outp = GiSimdFmaLane(_outp, _r11, _k0123, 3); | |||||
_outp = MLA(_outp, _r10, _k0123, 2); | |||||
_outp = MLA(_outp, _r11, _k0123, 3); | |||||
GiStoreFloat32(outptr, _outp); | GiStoreFloat32(outptr, _outp); | ||||
@@ -84,10 +112,18 @@ void conv_stride2::do_conv_3x3_stride2( | |||||
const float* k1 = filter + 3; | const float* k1 = filter + 3; | ||||
const float* k2 = filter + 5; | const float* k2 = filter + 5; | ||||
#if defined(PREFER_VF) | |||||
const float* _k0123 = k0; | |||||
const float* _k3456 = k1; | |||||
const float* _k5678 = k2; | |||||
float _k6789[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
ext_float32_ptr(_k5678, _k5678, 1, _k6789); | |||||
#else | |||||
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | ||||
GI_FLOAT32_t _k3456 = GiLoadFloat32(k1); | GI_FLOAT32_t _k3456 = GiLoadFloat32(k1); | ||||
GI_FLOAT32_t _k5678 = GiLoadFloat32(k2); | GI_FLOAT32_t _k5678 = GiLoadFloat32(k2); | ||||
GI_FLOAT32_t _k6789 = GiExtqFloat32(_k5678, _k5678, 1); | GI_FLOAT32_t _k6789 = GiExtqFloat32(_k5678, _k5678, 1); | ||||
#endif | |||||
rep(h, OH) { | rep(h, OH) { | ||||
int nn = OW >> 2; | int nn = OW >> 2; | ||||
@@ -102,9 +138,9 @@ void conv_stride2::do_conv_3x3_stride2( | |||||
GI_FLOAT32_t _r02 = GiExtqFloat32( | GI_FLOAT32_t _r02 = GiExtqFloat32( | ||||
_r00, GiGetSubVectorFloat32V2(_r0n, 0), 1); // 2 4 6 8 | _r00, GiGetSubVectorFloat32V2(_r0n, 0), 1); // 2 4 6 8 | ||||
_outp = GiSimdFmaLane(_outp, _r00, _k0123, 0); | |||||
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); | |||||
_outp = GiSimdFmaLane(_outp, _r02, _k0123, 2); | |||||
_outp = MLA(_outp, _r00, _k0123, 0); | |||||
_outp = MLA(_outp, _r01, _k0123, 1); | |||||
_outp = MLA(_outp, _r02, _k0123, 2); | |||||
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); | GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); | ||||
GI_FLOAT32_V2_t _r1n = GiLoadUzipFloat32V2(r1 + 8); | GI_FLOAT32_V2_t _r1n = GiLoadUzipFloat32V2(r1 + 8); | ||||
@@ -114,9 +150,9 @@ void conv_stride2::do_conv_3x3_stride2( | |||||
GI_FLOAT32_t _r12 = | GI_FLOAT32_t _r12 = | ||||
GiExtqFloat32(_r10, GiGetSubVectorFloat32V2(_r1n, 0), 1); | GiExtqFloat32(_r10, GiGetSubVectorFloat32V2(_r1n, 0), 1); | ||||
_outp = GiSimdFmaLane(_outp, _r10, _k3456, 0); | |||||
_outp = GiSimdFmaLane(_outp, _r11, _k3456, 1); | |||||
_outp = GiSimdFmaLane(_outp, _r12, _k3456, 2); | |||||
_outp = MLA(_outp, _r10, _k3456, 0); | |||||
_outp = MLA(_outp, _r11, _k3456, 1); | |||||
_outp = MLA(_outp, _r12, _k3456, 2); | |||||
GI_FLOAT32_V2_t _r2 = GiLoadUzipFloat32V2(r2); | GI_FLOAT32_V2_t _r2 = GiLoadUzipFloat32V2(r2); | ||||
GI_FLOAT32_V2_t _r2n = GiLoadUzipFloat32V2(r2 + 8); | GI_FLOAT32_V2_t _r2n = GiLoadUzipFloat32V2(r2 + 8); | ||||
@@ -126,9 +162,9 @@ void conv_stride2::do_conv_3x3_stride2( | |||||
GI_FLOAT32_t _r22 = | GI_FLOAT32_t _r22 = | ||||
GiExtqFloat32(_r20, GiGetSubVectorFloat32V2(_r2n, 0), 1); | GiExtqFloat32(_r20, GiGetSubVectorFloat32V2(_r2n, 0), 1); | ||||
_outp = GiSimdFmaLane(_outp, _r20, _k6789, 0); | |||||
_outp = GiSimdFmaLane(_outp, _r21, _k6789, 1); | |||||
_outp = GiSimdFmaLane(_outp, _r22, _k6789, 2); | |||||
_outp = MLA(_outp, _r20, _k6789, 0); | |||||
_outp = MLA(_outp, _r21, _k6789, 1); | |||||
_outp = MLA(_outp, _r22, _k6789, 2); | |||||
GiStoreFloat32(outptr, _outp); | GiStoreFloat32(outptr, _outp); | ||||
@@ -162,6 +198,15 @@ void conv_stride2::do_conv_5x5_stride2( | |||||
const float* r3 = src_ptr + IW * 3; | const float* r3 = src_ptr + IW * 3; | ||||
const float* r4 = src_ptr + IW * 4; | const float* r4 = src_ptr + IW * 4; | ||||
#if defined(PREFER_VF) | |||||
const float* _k0123 = filter; | |||||
const float* _k4567 = filter + 4; | |||||
const float* _k891011 = filter + 8; | |||||
const float* _k12131415 = filter + 12; | |||||
const float* _k16171819 = filter + 16; | |||||
const float* _k20212223 = filter + 20; | |||||
const float* _k24242424 = filter + 24; | |||||
#else | |||||
GI_FLOAT32_t _k0123 = GiLoadFloat32(filter); | GI_FLOAT32_t _k0123 = GiLoadFloat32(filter); | ||||
GI_FLOAT32_t _k4567 = GiLoadFloat32(filter + 4); | GI_FLOAT32_t _k4567 = GiLoadFloat32(filter + 4); | ||||
GI_FLOAT32_t _k891011 = GiLoadFloat32(filter + 8); | GI_FLOAT32_t _k891011 = GiLoadFloat32(filter + 8); | ||||
@@ -169,6 +214,7 @@ void conv_stride2::do_conv_5x5_stride2( | |||||
GI_FLOAT32_t _k16171819 = GiLoadFloat32(filter + 16); | GI_FLOAT32_t _k16171819 = GiLoadFloat32(filter + 16); | ||||
GI_FLOAT32_t _k20212223 = GiLoadFloat32(filter + 20); | GI_FLOAT32_t _k20212223 = GiLoadFloat32(filter + 20); | ||||
GI_FLOAT32_t _k24242424 = GiBroadcastFloat32(filter[24]); | GI_FLOAT32_t _k24242424 = GiBroadcastFloat32(filter[24]); | ||||
#endif | |||||
for (size_t i = 0; i < OH; i++) { | for (size_t i = 0; i < OH; i++) { | ||||
int nn = OW >> 2; | int nn = OW >> 2; | ||||
@@ -230,35 +276,35 @@ void conv_stride2::do_conv_5x5_stride2( | |||||
GI_FLOAT32_t _r43 = GiExtqFloat32(_r41, _r4_9111315, 1); | GI_FLOAT32_t _r43 = GiExtqFloat32(_r41, _r4_9111315, 1); | ||||
GI_FLOAT32_t _r44 = GiExtqFloat32(_r40, _r4_8101214, 2); | GI_FLOAT32_t _r44 = GiExtqFloat32(_r40, _r4_8101214, 2); | ||||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r10, _k4567, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r11, _k4567, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r12, _k4567, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r13, _k891011, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r14, _k891011, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r20, _k891011, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r21, _k891011, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0); | |||||
_sum = MLA(_sum, _r00, _k0123, 0); | |||||
_sum = MLA(_sum, _r01, _k0123, 1); | |||||
_sum = MLA(_sum, _r02, _k0123, 2); | |||||
_sum = MLA(_sum, _r03, _k0123, 3); | |||||
_sum = MLA(_sum, _r04, _k4567, 0); | |||||
_sum = MLA(_sum, _r10, _k4567, 1); | |||||
_sum = MLA(_sum, _r11, _k4567, 2); | |||||
_sum = MLA(_sum, _r12, _k4567, 3); | |||||
_sum = MLA(_sum, _r13, _k891011, 0); | |||||
_sum = MLA(_sum, _r14, _k891011, 1); | |||||
_sum = MLA(_sum, _r20, _k891011, 2); | |||||
_sum = MLA(_sum, _r21, _k891011, 3); | |||||
_sum = MLA(_sum, _r22, _k12131415, 0); | |||||
_sum = MLA(_sum, _r23, _k12131415, 1); | |||||
_sum = MLA(_sum, _r24, _k12131415, 2); | |||||
_sum = MLA(_sum, _r30, _k12131415, 3); | |||||
_sum = MLA(_sum, _r31, _k16171819, 0); | |||||
_sum = MLA(_sum, _r32, _k16171819, 1); | |||||
_sum = MLA(_sum, _r33, _k16171819, 2); | |||||
_sum = MLA(_sum, _r34, _k16171819, 3); | |||||
_sum = MLA(_sum, _r40, _k20212223, 0); | |||||
_sum = MLA(_sum, _r41, _k20212223, 1); | |||||
_sum = MLA(_sum, _r42, _k20212223, 2); | |||||
_sum = MLA(_sum, _r43, _k20212223, 3); | |||||
_sum = MLA(_sum, _r44, _k24242424, 0); | |||||
GiStoreFloat32(outptr, _sum); | GiStoreFloat32(outptr, _sum); | ||||
@@ -312,8 +358,13 @@ void conv_stride2::do_conv_7x7_stride2( | |||||
rep(i, nn) { | rep(i, nn) { | ||||
GI_FLOAT32_t _sum = GiLoadFloat32(outptr); | GI_FLOAT32_t _sum = GiLoadFloat32(outptr); | ||||
#if defined(PREFER_VF) | |||||
const float* _k0123 = k0; | |||||
const float* _k4567 = k0 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | ||||
GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4); | GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4); | ||||
#endif | |||||
GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0); | GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0); | ||||
GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8); | GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8); | ||||
@@ -331,16 +382,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||||
GI_FLOAT32_t _r05 = GiExtqFloat32(_r01, _r0_9111315, 2); // 5 7 9 11 | GI_FLOAT32_t _r05 = GiExtqFloat32(_r01, _r0_9111315, 2); // 5 7 9 11 | ||||
GI_FLOAT32_t _r06 = GiExtqFloat32(_r00, _r0_8101214, 3); // 6 8 10 12 | GI_FLOAT32_t _r06 = GiExtqFloat32(_r00, _r0_8101214, 3); // 6 8 10 12 | ||||
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r05, _k4567, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r06, _k4567, 2); | |||||
_sum = MLA(_sum, _r00, _k0123, 0); | |||||
_sum = MLA(_sum, _r01, _k0123, 1); | |||||
_sum = MLA(_sum, _r02, _k0123, 2); | |||||
_sum = MLA(_sum, _r03, _k0123, 3); | |||||
_sum = MLA(_sum, _r04, _k4567, 0); | |||||
_sum = MLA(_sum, _r05, _k4567, 1); | |||||
_sum = MLA(_sum, _r06, _k4567, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k78910 = k1; | |||||
const float* _k11121314 = k1 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k78910 = GiLoadFloat32(k1); | GI_FLOAT32_t _k78910 = GiLoadFloat32(k1); | ||||
GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4); | GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4); | ||||
#endif | |||||
GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1); | GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1); | ||||
GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8); | GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8); | ||||
@@ -354,16 +410,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||||
GI_FLOAT32_t _r15 = GiExtqFloat32(_r11, _r1_9111315, 2); | GI_FLOAT32_t _r15 = GiExtqFloat32(_r11, _r1_9111315, 2); | ||||
GI_FLOAT32_t _r16 = GiExtqFloat32(_r10, _r1_8101214, 3); | GI_FLOAT32_t _r16 = GiExtqFloat32(_r10, _r1_8101214, 3); | ||||
_sum = GiSimdFmaLane(_sum, _r10, _k78910, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r11, _k78910, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r12, _k78910, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r13, _k78910, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r14, _k11121314, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r15, _k11121314, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r16, _k11121314, 2); | |||||
_sum = MLA(_sum, _r10, _k78910, 0); | |||||
_sum = MLA(_sum, _r11, _k78910, 1); | |||||
_sum = MLA(_sum, _r12, _k78910, 2); | |||||
_sum = MLA(_sum, _r13, _k78910, 3); | |||||
_sum = MLA(_sum, _r14, _k11121314, 0); | |||||
_sum = MLA(_sum, _r15, _k11121314, 1); | |||||
_sum = MLA(_sum, _r16, _k11121314, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k14151617 = k2; | |||||
const float* _k18192021 = k2 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2); | GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2); | ||||
GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4); | GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4); | ||||
#endif | |||||
GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2); | GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2); | ||||
GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8); | GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8); | ||||
@@ -377,16 +438,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||||
GI_FLOAT32_t _r25 = GiExtqFloat32(_r21, _r2_9111315, 2); | GI_FLOAT32_t _r25 = GiExtqFloat32(_r21, _r2_9111315, 2); | ||||
GI_FLOAT32_t _r26 = GiExtqFloat32(_r20, _r2_8101214, 3); | GI_FLOAT32_t _r26 = GiExtqFloat32(_r20, _r2_8101214, 3); | ||||
_sum = GiSimdFmaLane(_sum, _r20, _k14151617, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r21, _k14151617, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r22, _k14151617, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r23, _k14151617, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r24, _k18192021, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r25, _k18192021, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r26, _k18192021, 2); | |||||
_sum = MLA(_sum, _r20, _k14151617, 0); | |||||
_sum = MLA(_sum, _r21, _k14151617, 1); | |||||
_sum = MLA(_sum, _r22, _k14151617, 2); | |||||
_sum = MLA(_sum, _r23, _k14151617, 3); | |||||
_sum = MLA(_sum, _r24, _k18192021, 0); | |||||
_sum = MLA(_sum, _r25, _k18192021, 1); | |||||
_sum = MLA(_sum, _r26, _k18192021, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k21222324 = k3; | |||||
const float* _k25262728 = k3 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3); | GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3); | ||||
GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4); | GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4); | ||||
#endif | |||||
GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3); | GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3); | ||||
GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8); | GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8); | ||||
@@ -400,16 +466,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||||
GI_FLOAT32_t _r35 = GiExtqFloat32(_r31, _r3_9111315, 2); | GI_FLOAT32_t _r35 = GiExtqFloat32(_r31, _r3_9111315, 2); | ||||
GI_FLOAT32_t _r36 = GiExtqFloat32(_r30, _r3_8101214, 3); | GI_FLOAT32_t _r36 = GiExtqFloat32(_r30, _r3_8101214, 3); | ||||
_sum = GiSimdFmaLane(_sum, _r30, _k21222324, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r31, _k21222324, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r32, _k21222324, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r33, _k21222324, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r34, _k25262728, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r35, _k25262728, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r36, _k25262728, 2); | |||||
_sum = MLA(_sum, _r30, _k21222324, 0); | |||||
_sum = MLA(_sum, _r31, _k21222324, 1); | |||||
_sum = MLA(_sum, _r32, _k21222324, 2); | |||||
_sum = MLA(_sum, _r33, _k21222324, 3); | |||||
_sum = MLA(_sum, _r34, _k25262728, 0); | |||||
_sum = MLA(_sum, _r35, _k25262728, 1); | |||||
_sum = MLA(_sum, _r36, _k25262728, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k28293031 = k4; | |||||
const float* _k32333435 = k4 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4); | GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4); | ||||
GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4); | GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4); | ||||
#endif | |||||
GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4); | GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4); | ||||
GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8); | GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8); | ||||
@@ -423,16 +494,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||||
GI_FLOAT32_t _r45 = GiExtqFloat32(_r41, _r4_9111315, 2); | GI_FLOAT32_t _r45 = GiExtqFloat32(_r41, _r4_9111315, 2); | ||||
GI_FLOAT32_t _r46 = GiExtqFloat32(_r40, _r4_8101214, 3); | GI_FLOAT32_t _r46 = GiExtqFloat32(_r40, _r4_8101214, 3); | ||||
_sum = GiSimdFmaLane(_sum, _r40, _k28293031, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r41, _k28293031, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r42, _k28293031, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r43, _k28293031, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r44, _k32333435, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r45, _k32333435, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r46, _k32333435, 2); | |||||
_sum = MLA(_sum, _r40, _k28293031, 0); | |||||
_sum = MLA(_sum, _r41, _k28293031, 1); | |||||
_sum = MLA(_sum, _r42, _k28293031, 2); | |||||
_sum = MLA(_sum, _r43, _k28293031, 3); | |||||
_sum = MLA(_sum, _r44, _k32333435, 0); | |||||
_sum = MLA(_sum, _r45, _k32333435, 1); | |||||
_sum = MLA(_sum, _r46, _k32333435, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k35363738 = k5; | |||||
const float* _k39404142 = k5 + 4; | |||||
#else | |||||
GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5); | GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5); | ||||
GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4); | GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4); | ||||
#endif | |||||
GI_FLOAT32_V2_t _r50_02461357 = GiLoadUzipFloat32V2(r5); | GI_FLOAT32_V2_t _r50_02461357 = GiLoadUzipFloat32V2(r5); | ||||
GI_FLOAT32_V2_t _r50nx2 = GiLoadUzipFloat32V2(r5 + 8); | GI_FLOAT32_V2_t _r50nx2 = GiLoadUzipFloat32V2(r5 + 8); | ||||
@@ -446,16 +522,21 @@ void conv_stride2::do_conv_7x7_stride2( | |||||
GI_FLOAT32_t _r55 = GiExtqFloat32(_r51, _r5_9111315, 2); | GI_FLOAT32_t _r55 = GiExtqFloat32(_r51, _r5_9111315, 2); | ||||
GI_FLOAT32_t _r56 = GiExtqFloat32(_r50, _r5_8101214, 3); | GI_FLOAT32_t _r56 = GiExtqFloat32(_r50, _r5_8101214, 3); | ||||
_sum = GiSimdFmaLane(_sum, _r50, _k35363738, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r51, _k35363738, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r52, _k35363738, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r53, _k35363738, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r54, _k39404142, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r55, _k39404142, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r56, _k39404142, 2); | |||||
_sum = MLA(_sum, _r50, _k35363738, 0); | |||||
_sum = MLA(_sum, _r51, _k35363738, 1); | |||||
_sum = MLA(_sum, _r52, _k35363738, 2); | |||||
_sum = MLA(_sum, _r53, _k35363738, 3); | |||||
_sum = MLA(_sum, _r54, _k39404142, 0); | |||||
_sum = MLA(_sum, _r55, _k39404142, 1); | |||||
_sum = MLA(_sum, _r56, _k39404142, 2); | |||||
#if defined(PREFER_VF) | |||||
const float* _k42434445 = k6; | |||||
const float* _k45464748 = k6 + 3; | |||||
#else | |||||
GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6); | GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6); | ||||
GI_FLOAT32_t _k45464748 = GiLoadFloat32(k6 + 3); | GI_FLOAT32_t _k45464748 = GiLoadFloat32(k6 + 3); | ||||
#endif | |||||
GI_FLOAT32_V2_t _r60_02461357 = GiLoadUzipFloat32V2(r6); | GI_FLOAT32_V2_t _r60_02461357 = GiLoadUzipFloat32V2(r6); | ||||
GI_FLOAT32_V2_t _r60nx2 = GiLoadUzipFloat32V2(r6 + 8); | GI_FLOAT32_V2_t _r60nx2 = GiLoadUzipFloat32V2(r6 + 8); | ||||
@@ -469,13 +550,13 @@ void conv_stride2::do_conv_7x7_stride2( | |||||
GI_FLOAT32_t _r65 = GiExtqFloat32(_r61, _r6_9111315, 2); | GI_FLOAT32_t _r65 = GiExtqFloat32(_r61, _r6_9111315, 2); | ||||
GI_FLOAT32_t _r66 = GiExtqFloat32(_r60, _r6_8101214, 3); | GI_FLOAT32_t _r66 = GiExtqFloat32(_r60, _r6_8101214, 3); | ||||
_sum = GiSimdFmaLane(_sum, _r60, _k42434445, 0); | |||||
_sum = GiSimdFmaLane(_sum, _r61, _k42434445, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r62, _k42434445, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r63, _k42434445, 3); | |||||
_sum = GiSimdFmaLane(_sum, _r64, _k45464748, 1); | |||||
_sum = GiSimdFmaLane(_sum, _r65, _k45464748, 2); | |||||
_sum = GiSimdFmaLane(_sum, _r66, _k45464748, 3); | |||||
_sum = MLA(_sum, _r60, _k42434445, 0); | |||||
_sum = MLA(_sum, _r61, _k42434445, 1); | |||||
_sum = MLA(_sum, _r62, _k42434445, 2); | |||||
_sum = MLA(_sum, _r63, _k42434445, 3); | |||||
_sum = MLA(_sum, _r64, _k45464748, 1); | |||||
_sum = MLA(_sum, _r65, _k45464748, 2); | |||||
_sum = MLA(_sum, _r66, _k45464748, 3); | |||||
GiStoreFloat32(outptr, _sum); | GiStoreFloat32(outptr, _sum); | ||||
@@ -75,6 +75,21 @@ struct InputTransformF73_NCHW44 { | |||||
size_t icb = ic / pack_size; | size_t icb = ic / pack_size; | ||||
GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8; | GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8; | ||||
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) | |||||
//! The x86 and RVV implementations of GiSimdFmaLane are slow; use
//! GiMultiplyAddScalarFloat32 / GiMultiplySubScalarFloat32 as an alternative.
#define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) | |||||
const float* v0 = input_parameters + 0; | |||||
const float* v1 = input_parameters + 4; | |||||
const float* v2 = input_parameters + 8; | |||||
const float* v3 = input_parameters + 12; | |||||
const float* v4 = input_parameters + 16; | |||||
const float* v5 = input_parameters + 20; | |||||
const float* v6 = input_parameters + 24; | |||||
#define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d)) | |||||
#else | |||||
#define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d) | |||||
#define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d) | |||||
GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters + 0); | GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters + 0); | ||||
GI_FLOAT32_t v1 = GiLoadFloat32(input_parameters + 4); | GI_FLOAT32_t v1 = GiLoadFloat32(input_parameters + 4); | ||||
GI_FLOAT32_t v2 = GiLoadFloat32(input_parameters + 8); | GI_FLOAT32_t v2 = GiLoadFloat32(input_parameters + 8); | ||||
@@ -82,6 +97,7 @@ struct InputTransformF73_NCHW44 { | |||||
GI_FLOAT32_t v4 = GiLoadFloat32(input_parameters + 16); | GI_FLOAT32_t v4 = GiLoadFloat32(input_parameters + 16); | ||||
GI_FLOAT32_t v5 = GiLoadFloat32(input_parameters + 20); | GI_FLOAT32_t v5 = GiLoadFloat32(input_parameters + 20); | ||||
GI_FLOAT32_t v6 = GiLoadFloat32(input_parameters + 24); | GI_FLOAT32_t v6 = GiLoadFloat32(input_parameters + 24); | ||||
#endif | |||||
//! B | //! B | ||||
//! 1.5 0 0 0 0 0 0 0 0 | //! 1.5 0 0 0 0 0 0 0 0 | ||||
@@ -120,59 +136,59 @@ struct InputTransformF73_NCHW44 { | |||||
auto t##i##5 = d7; \ | auto t##i##5 = d7; \ | ||||
auto t##i##6 = d7; \ | auto t##i##6 = d7; \ | ||||
auto t##i##7 = d7; \ | auto t##i##7 = d7; \ | ||||
t##i##8 = GiFmsqLaneQFloat32(t##i##8, d7, v0, 0); \ | |||||
t##i##8 = MSUB(t##i##8, d7, v0, 0); \ | |||||
t##i##0 = GiSubtractFloat32(t##i##0, d1); \ | t##i##0 = GiSubtractFloat32(t##i##0, d1); \ | ||||
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d1, v0, 0); \ | |||||
t##i##2 = GiSimdFmaLane(t##i##2, d1, v0, 0); \ | |||||
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d1, v0, 1); \ | |||||
t##i##4 = GiSimdFmaLane(t##i##4, d1, v0, 1); \ | |||||
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d1, v0, 2); \ | |||||
t##i##6 = GiSimdFmaLane(t##i##6, d1, v0, 2); \ | |||||
t##i##1 = MSUB(t##i##1, d1, v0, 0); \ | |||||
t##i##2 = MADD(t##i##2, d1, v0, 0); \ | |||||
t##i##3 = MSUB(t##i##3, d1, v0, 1); \ | |||||
t##i##4 = MADD(t##i##4, d1, v0, 1); \ | |||||
t##i##5 = MSUB(t##i##5, d1, v0, 2); \ | |||||
t##i##6 = MADD(t##i##6, d1, v0, 2); \ | |||||
t##i##7 = GiSubtractFloat32(t##i##7, d1); \ | t##i##7 = GiSubtractFloat32(t##i##7, d1); \ | ||||
t##i##8 = GiSimdFmaLane(t##i##8, d1, v0, 0); \ | |||||
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d2, v0, 3); \ | |||||
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d2, v1, 0); \ | |||||
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d2, v1, 1); \ | |||||
t##i##3 = GiSimdFmaLane(t##i##3, d2, v1, 2); \ | |||||
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d2, v1, 3); \ | |||||
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d2, v2, 0); \ | |||||
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d2, v2, 1); \ | |||||
t##i##8 = MADD(t##i##8, d1, v0, 0); \ | |||||
t##i##0 = MSUB(t##i##0, d2, v0, 3); \ | |||||
t##i##1 = MSUB(t##i##1, d2, v1, 0); \ | |||||
t##i##2 = MSUB(t##i##2, d2, v1, 1); \ | |||||
t##i##3 = MADD(t##i##3, d2, v1, 2); \ | |||||
t##i##4 = MSUB(t##i##4, d2, v1, 3); \ | |||||
t##i##5 = MSUB(t##i##5, d2, v2, 0); \ | |||||
t##i##6 = MSUB(t##i##6, d2, v2, 1); \ | |||||
t##i##8 = GiSubtractFloat32(t##i##8, d2); \ | t##i##8 = GiSubtractFloat32(t##i##8, d2); \ | ||||
t##i##0 = GiSimdFmaLane(t##i##0, d3, v2, 2); \ | |||||
t##i##1 = GiSimdFmaLane(t##i##1, d3, v2, 3); \ | |||||
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d3, v3, 0); \ | |||||
t##i##3 = GiSimdFmaLane(t##i##3, d3, v2, 0); \ | |||||
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d3, v3, 1); \ | |||||
t##i##5 = GiSimdFmaLane(t##i##5, d3, v3, 2); \ | |||||
t##i##6 = GiSimdFmaLane(t##i##6, d3, v3, 3); \ | |||||
t##i##7 = GiSimdFmaLane(t##i##7, d3, v2, 2); \ | |||||
t##i##8 = GiFmsqLaneQFloat32(t##i##8, d3, v0, 3); \ | |||||
t##i##0 = GiSimdFmaLane(t##i##0, d4, v0, 3); \ | |||||
t##i##1 = GiSimdFmaLane(t##i##1, d4, v4, 0); \ | |||||
t##i##2 = GiSimdFmaLane(t##i##2, d4, v4, 1); \ | |||||
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d4, v4, 2); \ | |||||
t##i##4 = GiSimdFmaLane(t##i##4, d4, v4, 3); \ | |||||
t##i##5 = GiSimdFmaLane(t##i##5, d4, v5, 0); \ | |||||
t##i##6 = GiSimdFmaLane(t##i##6, d4, v5, 1); \ | |||||
t##i##8 = GiSimdFmaLane(t##i##8, d4, v2, 2); \ | |||||
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d5, v2, 2); \ | |||||
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d5, v5, 2); \ | |||||
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d5, v5, 3); \ | |||||
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d5, v6, 0); \ | |||||
t##i##4 = GiSimdFmaLane(t##i##4, d5, v6, 1); \ | |||||
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d5, v5, 2); \ | |||||
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d5, v6, 0); \ | |||||
t##i##7 = GiFmsqLaneQFloat32(t##i##7, d5, v2, 2); \ | |||||
t##i##8 = GiSimdFmaLane(t##i##8, d5, v0, 3); \ | |||||
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d6, v0, 0); \ | |||||
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d6, v1, 0); \ | |||||
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d6, v1, 1); \ | |||||
t##i##3 = GiSimdFmaLane(t##i##3, d6, v1, 0); \ | |||||
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d6, v3, 1); \ | |||||
t##i##0 = MADD(t##i##0, d3, v2, 2); \ | |||||
t##i##1 = MADD(t##i##1, d3, v2, 3); \ | |||||
t##i##2 = MSUB(t##i##2, d3, v3, 0); \ | |||||
t##i##3 = MADD(t##i##3, d3, v2, 0); \ | |||||
t##i##4 = MSUB(t##i##4, d3, v3, 1); \ | |||||
t##i##5 = MADD(t##i##5, d3, v3, 2); \ | |||||
t##i##6 = MADD(t##i##6, d3, v3, 3); \ | |||||
t##i##7 = MADD(t##i##7, d3, v2, 2); \ | |||||
t##i##8 = MSUB(t##i##8, d3, v0, 3); \ | |||||
t##i##0 = MADD(t##i##0, d4, v0, 3); \ | |||||
t##i##1 = MADD(t##i##1, d4, v4, 0); \ | |||||
t##i##2 = MADD(t##i##2, d4, v4, 1); \ | |||||
t##i##3 = MSUB(t##i##3, d4, v4, 2); \ | |||||
t##i##4 = MADD(t##i##4, d4, v4, 3); \ | |||||
t##i##5 = MADD(t##i##5, d4, v5, 0); \ | |||||
t##i##6 = MADD(t##i##6, d4, v5, 1); \ | |||||
t##i##8 = MADD(t##i##8, d4, v2, 2); \ | |||||
t##i##0 = MSUB(t##i##0, d5, v2, 2); \ | |||||
t##i##1 = MSUB(t##i##1, d5, v5, 2); \ | |||||
t##i##2 = MSUB(t##i##2, d5, v5, 3); \ | |||||
t##i##3 = MSUB(t##i##3, d5, v6, 0); \ | |||||
t##i##4 = MADD(t##i##4, d5, v6, 1); \ | |||||
t##i##5 = MSUB(t##i##5, d5, v5, 2); \ | |||||
t##i##6 = MSUB(t##i##6, d5, v6, 0); \ | |||||
t##i##7 = MSUB(t##i##7, d5, v2, 2); \ | |||||
t##i##8 = MADD(t##i##8, d5, v0, 3); \ | |||||
t##i##0 = MSUB(t##i##0, d6, v0, 0); \ | |||||
t##i##1 = MSUB(t##i##1, d6, v1, 0); \ | |||||
t##i##2 = MSUB(t##i##2, d6, v1, 1); \ | |||||
t##i##3 = MADD(t##i##3, d6, v1, 0); \ | |||||
t##i##4 = MSUB(t##i##4, d6, v3, 1); \ | |||||
t##i##5 = GiSubtractFloat32(t##i##5, d6); \ | t##i##5 = GiSubtractFloat32(t##i##5, d6); \ | ||||
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d6, v6, 2); \ | |||||
t##i##8 = GiFmsqLaneQFloat32(t##i##8, d6, v2, 2); \ | |||||
t##i##0 = GiSimdFmaLane(t##i##0, d0, v0, 0); | |||||
t##i##6 = MSUB(t##i##6, d6, v6, 2); \ | |||||
t##i##8 = MSUB(t##i##8, d6, v2, 2); \ | |||||
t##i##0 = MADD(t##i##0, d0, v0, 0); | |||||
UNROLL_CALL_RAW(9, cb); | UNROLL_CALL_RAW(9, cb); | ||||
#undef cb | #undef cb | ||||
@@ -187,59 +203,59 @@ struct InputTransformF73_NCHW44 { | |||||
d5 = t7##i; \ | d5 = t7##i; \ | ||||
d6 = t7##i; \ | d6 = t7##i; \ | ||||
d7 = t7##i; \ | d7 = t7##i; \ | ||||
d8 = GiFmsqLaneQFloat32(d8, t7##i, v0, 0); \ | |||||
d8 = MSUB(d8, t7##i, v0, 0); \ | |||||
d0 = GiSubtractFloat32(d0, t1##i); \ | d0 = GiSubtractFloat32(d0, t1##i); \ | ||||
d1 = GiFmsqLaneQFloat32(d1, t1##i, v0, 0); \ | |||||
d2 = GiSimdFmaLane(d2, t1##i, v0, 0); \ | |||||
d3 = GiFmsqLaneQFloat32(d3, t1##i, v0, 1); \ | |||||
d4 = GiSimdFmaLane(d4, t1##i, v0, 1); \ | |||||
d5 = GiFmsqLaneQFloat32(d5, t1##i, v0, 2); \ | |||||
d6 = GiSimdFmaLane(d6, t1##i, v0, 2); \ | |||||
d1 = MSUB(d1, t1##i, v0, 0); \ | |||||
d2 = MADD(d2, t1##i, v0, 0); \ | |||||
d3 = MSUB(d3, t1##i, v0, 1); \ | |||||
d4 = MADD(d4, t1##i, v0, 1); \ | |||||
d5 = MSUB(d5, t1##i, v0, 2); \ | |||||
d6 = MADD(d6, t1##i, v0, 2); \ | |||||
d7 = GiSubtractFloat32(d7, t1##i); \ | d7 = GiSubtractFloat32(d7, t1##i); \ | ||||
d8 = GiSimdFmaLane(d8, t1##i, v0, 0); \ | |||||
d0 = GiFmsqLaneQFloat32(d0, t2##i, v0, 3); \ | |||||
d1 = GiFmsqLaneQFloat32(d1, t2##i, v1, 0); \ | |||||
d2 = GiFmsqLaneQFloat32(d2, t2##i, v1, 1); \ | |||||
d3 = GiSimdFmaLane(d3, t2##i, v1, 2); \ | |||||
d4 = GiFmsqLaneQFloat32(d4, t2##i, v1, 3); \ | |||||
d5 = GiFmsqLaneQFloat32(d5, t2##i, v2, 0); \ | |||||
d6 = GiFmsqLaneQFloat32(d6, t2##i, v2, 1); \ | |||||
d8 = MADD(d8, t1##i, v0, 0); \ | |||||
d0 = MSUB(d0, t2##i, v0, 3); \ | |||||
d1 = MSUB(d1, t2##i, v1, 0); \ | |||||
d2 = MSUB(d2, t2##i, v1, 1); \ | |||||
d3 = MADD(d3, t2##i, v1, 2); \ | |||||
d4 = MSUB(d4, t2##i, v1, 3); \ | |||||
d5 = MSUB(d5, t2##i, v2, 0); \ | |||||
d6 = MSUB(d6, t2##i, v2, 1); \ | |||||
d8 = GiSubtractFloat32(d8, t2##i); \ | d8 = GiSubtractFloat32(d8, t2##i); \ | ||||
d0 = GiSimdFmaLane(d0, t3##i, v2, 2); \ | |||||
d1 = GiSimdFmaLane(d1, t3##i, v2, 3); \ | |||||
d2 = GiFmsqLaneQFloat32(d2, t3##i, v3, 0); \ | |||||
d3 = GiSimdFmaLane(d3, t3##i, v2, 0); \ | |||||
d4 = GiFmsqLaneQFloat32(d4, t3##i, v3, 1); \ | |||||
d5 = GiSimdFmaLane(d5, t3##i, v3, 2); \ | |||||
d6 = GiSimdFmaLane(d6, t3##i, v3, 3); \ | |||||
d7 = GiSimdFmaLane(d7, t3##i, v2, 2); \ | |||||
d8 = GiFmsqLaneQFloat32(d8, t3##i, v0, 3); \ | |||||
d0 = GiSimdFmaLane(d0, t4##i, v0, 3); \ | |||||
d1 = GiSimdFmaLane(d1, t4##i, v4, 0); \ | |||||
d2 = GiSimdFmaLane(d2, t4##i, v4, 1); \ | |||||
d3 = GiFmsqLaneQFloat32(d3, t4##i, v4, 2); \ | |||||
d4 = GiSimdFmaLane(d4, t4##i, v4, 3); \ | |||||
d5 = GiSimdFmaLane(d5, t4##i, v5, 0); \ | |||||
d6 = GiSimdFmaLane(d6, t4##i, v5, 1); \ | |||||
d8 = GiSimdFmaLane(d8, t4##i, v2, 2); \ | |||||
d0 = GiFmsqLaneQFloat32(d0, t5##i, v2, 2); \ | |||||
d1 = GiFmsqLaneQFloat32(d1, t5##i, v5, 2); \ | |||||
d2 = GiFmsqLaneQFloat32(d2, t5##i, v5, 3); \ | |||||
d3 = GiFmsqLaneQFloat32(d3, t5##i, v6, 0); \ | |||||
d4 = GiSimdFmaLane(d4, t5##i, v6, 1); \ | |||||
d5 = GiFmsqLaneQFloat32(d5, t5##i, v5, 2); \ | |||||
d6 = GiFmsqLaneQFloat32(d6, t5##i, v6, 0); \ | |||||
d7 = GiFmsqLaneQFloat32(d7, t5##i, v2, 2); \ | |||||
d8 = GiSimdFmaLane(d8, t5##i, v0, 3); \ | |||||
d0 = GiFmsqLaneQFloat32(d0, t6##i, v0, 0); \ | |||||
d1 = GiFmsqLaneQFloat32(d1, t6##i, v1, 0); \ | |||||
d2 = GiFmsqLaneQFloat32(d2, t6##i, v1, 1); \ | |||||
d3 = GiSimdFmaLane(d3, t6##i, v1, 0); \ | |||||
d4 = GiFmsqLaneQFloat32(d4, t6##i, v3, 1); \ | |||||
d0 = MADD(d0, t3##i, v2, 2); \ | |||||
d1 = MADD(d1, t3##i, v2, 3); \ | |||||
d2 = MSUB(d2, t3##i, v3, 0); \ | |||||
d3 = MADD(d3, t3##i, v2, 0); \ | |||||
d4 = MSUB(d4, t3##i, v3, 1); \ | |||||
d5 = MADD(d5, t3##i, v3, 2); \ | |||||
d6 = MADD(d6, t3##i, v3, 3); \ | |||||
d7 = MADD(d7, t3##i, v2, 2); \ | |||||
d8 = MSUB(d8, t3##i, v0, 3); \ | |||||
d0 = MADD(d0, t4##i, v0, 3); \ | |||||
d1 = MADD(d1, t4##i, v4, 0); \ | |||||
d2 = MADD(d2, t4##i, v4, 1); \ | |||||
d3 = MSUB(d3, t4##i, v4, 2); \ | |||||
d4 = MADD(d4, t4##i, v4, 3); \ | |||||
d5 = MADD(d5, t4##i, v5, 0); \ | |||||
d6 = MADD(d6, t4##i, v5, 1); \ | |||||
d8 = MADD(d8, t4##i, v2, 2); \ | |||||
d0 = MSUB(d0, t5##i, v2, 2); \ | |||||
d1 = MSUB(d1, t5##i, v5, 2); \ | |||||
d2 = MSUB(d2, t5##i, v5, 3); \ | |||||
d3 = MSUB(d3, t5##i, v6, 0); \ | |||||
d4 = MADD(d4, t5##i, v6, 1); \ | |||||
d5 = MSUB(d5, t5##i, v5, 2); \ | |||||
d6 = MSUB(d6, t5##i, v6, 0); \ | |||||
d7 = MSUB(d7, t5##i, v2, 2); \ | |||||
d8 = MADD(d8, t5##i, v0, 3); \ | |||||
d0 = MSUB(d0, t6##i, v0, 0); \ | |||||
d1 = MSUB(d1, t6##i, v1, 0); \ | |||||
d2 = MSUB(d2, t6##i, v1, 1); \ | |||||
d3 = MADD(d3, t6##i, v1, 0); \ | |||||
d4 = MSUB(d4, t6##i, v3, 1); \ | |||||
d5 = GiSubtractFloat32(d5, t6##i); \ | d5 = GiSubtractFloat32(d5, t6##i); \ | ||||
d6 = GiFmsqLaneQFloat32(d6, t6##i, v6, 2); \ | |||||
d8 = GiFmsqLaneQFloat32(d8, t6##i, v2, 2); \ | |||||
d0 = GiSimdFmaLane(d0, t0##i, v0, 0); \ | |||||
d6 = MSUB(d6, t6##i, v6, 2); \ | |||||
d8 = MSUB(d8, t6##i, v2, 2); \ | |||||
d0 = MADD(d0, t0##i, v0, 0); \ | |||||
GiStoreFloat32( \ | GiStoreFloat32( \ | ||||
input_transform_buf + \ | input_transform_buf + \ | ||||
(0 * alpha + i) * ICB * nr_units_in_tile * pack_size + \ | (0 * alpha + i) * ICB * nr_units_in_tile * pack_size + \ | ||||
@@ -288,6 +304,8 @@ struct InputTransformF73_NCHW44 { | |||||
UNROLL_CALL_RAW(9, cb); | UNROLL_CALL_RAW(9, cb); | ||||
#undef cb | #undef cb | ||||
#undef MADD | |||||
#undef MSUB | |||||
} | } | ||||
}; | }; | ||||
@@ -224,9 +224,7 @@ GI_FLOAT32_t GiMlaqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t c) { | |||||
#endif | #endif | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
// fma is coming soon, but right now: | // fma is coming soon, but right now: | ||||
__m128 res; | |||||
res = _mm_mul_ps(c, b); | |||||
return _mm_add_ps(a, res); | |||||
return _mm_add_ps(a, _mm_mul_ps(c, b)); | |||||
#elif defined(GI_RVV_INTRINSICS) | #elif defined(GI_RVV_INTRINSICS) | ||||
return vfmadd_vv_f32m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(float)); | return vfmadd_vv_f32m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(float)); | ||||
#else | #else | ||||