diff --git a/dnn/src/fallback/resize/gi/helper.h b/dnn/src/fallback/resize/gi/helper.h index 7ce5bc53..5c178b67 100644 --- a/dnn/src/fallback/resize/gi/helper.h +++ b/dnn/src/fallback/resize/gi/helper.h @@ -14,7 +14,9 @@ struct SIMDHelper {}; template <> struct SIMDHelper { using simd_type = GI_FLOAT32_t; + using simd_fixlen_type = GI_FLOAT32_FIXLEN_t; using simd_type_x2 = GI_FLOAT32_V2_t; + using simd_type_x4 = GI_FLOAT32_V4_t; using ctype = float; static constexpr size_t simd_width = 4; @@ -27,8 +29,8 @@ struct SIMDHelper { static GI_FORCEINLINE void store2_interleave( ctype* dst_ptr, const simd_type& rdst1, const simd_type& rdst2) { simd_type_x2 rdst; - rdst.val[0] = rdst1; - rdst.val[1] = rdst2; + GiSetSubVectorFloat32V2(rdst, 0, rdst1); + GiSetSubVectorFloat32V2(rdst, 1, rdst2); GiStoreZipFloat32V2(dst_ptr, rdst); } static GI_FORCEINLINE simd_type diff --git a/dnn/src/fallback/resize/gi/resize_cv.cpp b/dnn/src/fallback/resize/gi/resize_cv.cpp index 17b2867f..23668e56 100644 --- a/dnn/src/fallback/resize/gi/resize_cv.cpp +++ b/dnn/src/fallback/resize/gi/resize_cv.cpp @@ -375,17 +375,24 @@ void resize_linear_32f_gi(const Mat32f& src, Mat32f& dst) { int dy = 0; GI_FLOAT32_t v_rx = GiBroadcastFloat32(rx); GI_FLOAT32_t v_irx = GiBroadcastFloat32(irx); -#define EXPAND(x) \ - v_cache0 = GiLoadUzipFloat32V3(cache0_ptr + dy + (x)*3); \ - v_cache1 = GiLoadUzipFloat32V3(cache1_ptr + dy + (x)*3); \ - v_dst.val[0] = GiMlaqFloat32( \ - GiMultiplyFloat32(v_rx, v_cache1.val[0]), v_irx, v_cache0.val[0]); \ - v_dst.val[1] = GiMlaqFloat32( \ - GiMultiplyFloat32(v_rx, v_cache1.val[1]), v_irx, v_cache0.val[1]); \ - v_dst.val[2] = GiMlaqFloat32( \ - GiMultiplyFloat32(v_rx, v_cache1.val[2]), v_irx, v_cache0.val[2]); \ +#define EXPAND(x) \ + v_cache0 = GiLoadUzipFloat32V3(cache0_ptr + dy + (x)*3); \ + v_cache1 = GiLoadUzipFloat32V3(cache1_ptr + dy + (x)*3); \ + a0 = GiMlaqFloat32( \ + GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 0)), v_irx, \ + GiGetSubVectorFloat32V3(v_cache0, 0)); \ + GiSetSubVectorFloat32V3(v_dst, 0, a0); \ + a1 = GiMlaqFloat32( \ + GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 1)), v_irx, \ + GiGetSubVectorFloat32V3(v_cache0, 1)); \ + GiSetSubVectorFloat32V3(v_dst, 1, a1); \ + a2 = GiMlaqFloat32( \ + GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 2)), v_irx, \ + GiGetSubVectorFloat32V3(v_cache0, 2)); \ + GiSetSubVectorFloat32V3(v_dst, 2, a2); \ GiStoreZipFloat32V3(pdst + dy + (x)*3, v_dst); + GI_FLOAT32_t a0, a1, a2; for (; dy + 8 * 3 <= dstcols; dy += 8 * 3) { GI_FLOAT32_V3_t v_cache0; GI_FLOAT32_V3_t v_cache1; @@ -560,8 +567,12 @@ struct ResizeAreaFastVec_SIMD_32f { for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) { GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1); - GI_FLOAT32_t v_dst0 = GiAddFloat32(v_row0.val[0], v_row0.val[1]); - GI_FLOAT32_t v_dst1 = GiAddFloat32(v_row1.val[0], v_row1.val[1]); + GI_FLOAT32_t v_dst0 = GiAddFloat32( + GiGetSubVectorFloat32V2(v_row0, 0), + GiGetSubVectorFloat32V2(v_row0, 1)); + GI_FLOAT32_t v_dst1 = GiAddFloat32( + GiGetSubVectorFloat32V2(v_row1, 0), + GiGetSubVectorFloat32V2(v_row1, 1)); GiStoreFloat32( D, GiMultiplyFloat32(GiAddFloat32(v_dst0, v_dst1), v_025)); diff --git a/dnn/src/fallback/resize/gi/upsample2_nchw.cpp b/dnn/src/fallback/resize/gi/upsample2_nchw.cpp index 126884b2..d9d4ce08 100644 --- a/dnn/src/fallback/resize/gi/upsample2_nchw.cpp +++ b/dnn/src/fallback/resize/gi/upsample2_nchw.cpp @@ -17,13 +17,21 @@ compute_linear_element(const ctype src[4], const ctype alpha[2]) { template static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element_simd( - const typename simd_helper::simd_type src[4], - const typename simd_helper::simd_type alpha[2][2]) { + const typename simd_helper::simd_type_x4 src, + const typename simd_helper::simd_fixlen_type alpha[2][2]) { typename simd_helper::simd_type c = simd_helper::dup(0); - c = simd_helper::fma(c, src[0], alpha[0 ^ fh][0 ^ fw]); - c = simd_helper::fma(c, src[1], alpha[0 ^ fh][1 ^ fw]); - c = simd_helper::fma(c, src[2], alpha[1 ^ fh][0 ^ fw]); - c = simd_helper::fma(c, src[3], alpha[1 ^ fh][1 ^ fw]); + c = simd_helper::fma( + c, GiGetSubVectorFloat32V4(src, 0), + GiFixLenType2GiFloat32Type(alpha[0 ^ fh][0 ^ fw])); + c = simd_helper::fma( + c, GiGetSubVectorFloat32V4(src, 1), + GiFixLenType2GiFloat32Type(alpha[0 ^ fh][1 ^ fw])); + c = simd_helper::fma( + c, GiGetSubVectorFloat32V4(src, 2), + GiFixLenType2GiFloat32Type(alpha[1 ^ fh][0 ^ fw])); + c = simd_helper::fma( + c, GiGetSubVectorFloat32V4(src, 3), + GiFixLenType2GiFloat32Type(alpha[1 ^ fh][1 ^ fw])); return c; } @@ -62,23 +70,37 @@ static GI_FORCEINLINE void compute_linear_2x2_element( template static GI_FORCEINLINE void compute_linear_2x2_element_simd( const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, - size_t IW, size_t OW, const typename simd_helper::simd_type alpha[2][2]) { + size_t IW, size_t OW, + const typename simd_helper::simd_fixlen_type alpha[2][2]) { + using simd_type_x4 = typename simd_helper::simd_type_x4; using simd_type = typename simd_helper::simd_type; - simd_type rsrc[4]; - rsrc[0] = simd_helper::load(src); - rsrc[1] = simd_helper::load(src + 1); - rsrc[2] = simd_helper::load(src + IW); - rsrc[3] = simd_helper::load(src + IW + 1); - - simd_type rdst[4]; - rdst[0] = compute_linear_element_simd(rsrc, alpha); - rdst[1] = compute_linear_element_simd(rsrc, alpha); - rdst[2] = compute_linear_element_simd(rsrc, alpha); - rdst[3] = compute_linear_element_simd(rsrc, alpha); - - simd_helper::store2_interleave(dst, rdst[0], rdst[1]); - simd_helper::store2_interleave(dst + OW, rdst[2], rdst[3]); + simd_type_x4 rsrc; + simd_type tmp; + tmp = simd_helper::load(src); + GiSetSubVectorFloat32V4(rsrc, 0, tmp); + tmp = simd_helper::load(src + 1); + GiSetSubVectorFloat32V4(rsrc, 1, tmp); + tmp = simd_helper::load(src + IW); + GiSetSubVectorFloat32V4(rsrc, 2, tmp); + tmp = simd_helper::load(src + IW + 1); + GiSetSubVectorFloat32V4(rsrc, 3, tmp); + + simd_type_x4 rdst; + tmp = compute_linear_element_simd(rsrc, alpha); + GiSetSubVectorFloat32V4(rdst, 0, tmp); + tmp = compute_linear_element_simd(rsrc, alpha); + GiSetSubVectorFloat32V4(rdst, 1, tmp); + tmp = compute_linear_element_simd(rsrc, alpha); + GiSetSubVectorFloat32V4(rdst, 2, tmp); + tmp = compute_linear_element_simd(rsrc, alpha); + GiSetSubVectorFloat32V4(rdst, 3, tmp); + + simd_helper::store2_interleave( + dst, GiGetSubVectorFloat32V4(rdst, 0), GiGetSubVectorFloat32V4(rdst, 1)); + simd_helper::store2_interleave( + dst + OW, GiGetSubVectorFloat32V4(rdst, 2), + GiGetSubVectorFloat32V4(rdst, 3)); } template @@ -90,11 +112,11 @@ void linear_upsample2_nchw( ctype alpha[2] = {0.75, 0.25}; - typename simd_helper::simd_type simd_alpha[2][2]; - simd_alpha[0][0] = simd_helper::dup(0.75 * 0.75); - simd_alpha[0][1] = simd_helper::dup(0.75 * 0.25); - simd_alpha[1][0] = simd_helper::dup(0.25 * 0.75); - simd_alpha[1][1] = simd_helper::dup(0.25 * 0.25); + typename simd_helper::simd_fixlen_type simd_alpha[2][2]; + simd_alpha[0][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.75)); + simd_alpha[0][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.25)); + simd_alpha[1][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.75)); + simd_alpha[1][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.25)); for (size_t i = 0; i < N; ++i) { compute_linear_2x2_element( diff --git a/dnn/src/fallback/resize/gi/upsample2_nchwxx.cpp b/dnn/src/fallback/resize/gi/upsample2_nchwxx.cpp index 6648e6cd..60178f78 100644 --- a/dnn/src/fallback/resize/gi/upsample2_nchwxx.cpp +++ b/dnn/src/fallback/resize/gi/upsample2_nchwxx.cpp @@ -8,20 +8,29 @@ namespace { template static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element( - const typename simd_helper::simd_type src[4], - const typename simd_helper::simd_type alpha[2][2]) { + const typename simd_helper::simd_type_x4 src, + const typename simd_helper::simd_fixlen_type alpha[2][2]) { typename simd_helper::simd_type c = simd_helper::dup(0); - c = simd_helper::fma(c, src[0], alpha[0 ^ fh][0 ^ fw]); - c = simd_helper::fma(c, src[1], alpha[0 ^ fh][1 ^ fw]); - c = simd_helper::fma(c, src[2], alpha[1 ^ fh][0 ^ fw]); - c = simd_helper::fma(c, src[3], alpha[1 ^ fh][1 ^ fw]); + c = simd_helper::fma( + c, GiGetSubVectorFloat32V4(src, 0), + GiFixLenType2GiFloat32Type(alpha[0 ^ fh][0 ^ fw])); + c = simd_helper::fma( + c, GiGetSubVectorFloat32V4(src, 1), + GiFixLenType2GiFloat32Type(alpha[0 ^ fh][1 ^ fw])); + c = simd_helper::fma( + c, GiGetSubVectorFloat32V4(src, 2), + GiFixLenType2GiFloat32Type(alpha[1 ^ fh][0 ^ fw])); + c = simd_helper::fma( + c, GiGetSubVectorFloat32V4(src, 3), + GiFixLenType2GiFloat32Type(alpha[1 ^ fh][1 ^ fw])); return c; } template static GI_FORCEINLINE void compute_linear_2x2_element( const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, - size_t IW, size_t OW, const typename simd_helper::simd_type alpha[2][2]) { + size_t IW, size_t OW, + const typename simd_helper::simd_fixlen_type alpha[2][2]) { constexpr size_t PC = simd_helper::simd_width; const typename simd_helper::ctype* src_ptr[4] = {src, src, src, src}; @@ -34,27 +43,33 @@ static GI_FORCEINLINE void compute_linear_2x2_element( src_ptr[3] += IW * PC; } - typename simd_helper::simd_type rsrc[4]; - rsrc[0] = simd_helper::load(src_ptr[0]); - rsrc[1] = simd_helper::load(src_ptr[1]); - rsrc[2] = simd_helper::load(src_ptr[2]); - rsrc[3] = simd_helper::load(src_ptr[3]); - - typename simd_helper::simd_type rdst[4]; - rdst[0] = compute_linear_element(rsrc, alpha); - rdst[1] = compute_linear_element(rsrc, alpha); - rdst[2] = compute_linear_element(rsrc, alpha); - rdst[3] = compute_linear_element(rsrc, alpha); - - simd_helper::store(dst, rdst[0]); + typename simd_helper::simd_type_x4 rsrc; + GiSetSubVectorFloat32V4(rsrc, 0, simd_helper::load(src_ptr[0])); + GiSetSubVectorFloat32V4(rsrc, 1, simd_helper::load(src_ptr[1])); + GiSetSubVectorFloat32V4(rsrc, 2, simd_helper::load(src_ptr[2])); + GiSetSubVectorFloat32V4(rsrc, 3, simd_helper::load(src_ptr[3])); + + typename simd_helper::simd_type_x4 rdst; + typename simd_helper::simd_type a, b, c, d; + a = compute_linear_element(rsrc, alpha); + b = compute_linear_element(rsrc, alpha); + c = compute_linear_element(rsrc, alpha); + d = compute_linear_element(rsrc, alpha); + + GiSetSubVectorFloat32V4(rdst, 0, a); + GiSetSubVectorFloat32V4(rdst, 1, b); + GiSetSubVectorFloat32V4(rdst, 2, c); + GiSetSubVectorFloat32V4(rdst, 3, d); + + simd_helper::store(dst, GiGetSubVectorFloat32V4(rdst, 0)); if (has_right) { - simd_helper::store(dst + PC, rdst[1]); + simd_helper::store(dst + PC, GiGetSubVectorFloat32V4(rdst, 1)); } if (has_bottom) { - simd_helper::store(dst + OW * PC, rdst[2]); + simd_helper::store(dst + OW * PC, GiGetSubVectorFloat32V4(rdst, 2)); } if (has_right && has_bottom) { - simd_helper::store(dst + (OW + 1) * PC, rdst[3]); + simd_helper::store(dst + (OW + 1) * PC, GiGetSubVectorFloat32V4(rdst, 3)); } } @@ -65,11 +80,12 @@ void linear_upsample2_nchwxx( size_t OW = IW * 2; constexpr size_t PC = simd_helper::simd_width; - typename simd_helper::simd_type alpha[2][2]; - alpha[0][0] = simd_helper::dup(0.75 * 0.75); - alpha[0][1] = simd_helper::dup(0.75 * 0.25); - alpha[1][0] = simd_helper::dup(0.25 * 0.75); - alpha[1][1] = simd_helper::dup(0.25 * 0.25); + typename simd_helper::simd_fixlen_type alpha[2][2]; + + alpha[0][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.75)); + alpha[0][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.25)); + alpha[1][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.75)); + alpha[1][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.25)); for (size_t i = 0; i < N; ++i) { compute_linear_2x2_element(