@@ -14,7 +14,9 @@ struct SIMDHelper {}; | |||||
template <> | template <> | ||||
struct SIMDHelper<float> { | struct SIMDHelper<float> { | ||||
using simd_type = GI_FLOAT32_t; | using simd_type = GI_FLOAT32_t; | ||||
using simd_fixlen_type = GI_FLOAT32_FIXLEN_t; | |||||
using simd_type_x2 = GI_FLOAT32_V2_t; | using simd_type_x2 = GI_FLOAT32_V2_t; | ||||
using simd_type_x4 = GI_FLOAT32_V4_t; | |||||
using ctype = float; | using ctype = float; | ||||
static constexpr size_t simd_width = 4; | static constexpr size_t simd_width = 4; | ||||
@@ -27,8 +29,8 @@ struct SIMDHelper<float> { | |||||
// Packs the two vectors rdst1 and rdst2 into a single simd_type_x2 value and
// writes it to dst_ptr with GiStoreZipFloat32V2 (presumably an interleaving
// store, as the function name suggests -- confirm against the GI header).
//
// NOTE(review): this span is a diff rendering. The two `rdst.val[i] = ...`
// rows and the two GiSetSubVectorFloat32V2 rows fill the same slots (0 and 1)
// of rdst; they look like the before/after forms of the same step, not four
// statements meant to coexist.
static GI_FORCEINLINE void store2_interleave( | static GI_FORCEINLINE void store2_interleave( | ||||
ctype* dst_ptr, const simd_type& rdst1, const simd_type& rdst2) { | ctype* dst_ptr, const simd_type& rdst1, const simd_type& rdst2) { | ||||
simd_type_x2 rdst; | simd_type_x2 rdst; | ||||
// Direct member access form: slot 0 <- rdst1, slot 1 <- rdst2.
rdst.val[0] = rdst1; | |||||
rdst.val[1] = rdst2; | |||||
// Accessor form of the same two assignments.
GiSetSubVectorFloat32V2(rdst, 0, rdst1); | |||||
GiSetSubVectorFloat32V2(rdst, 1, rdst2); | |||||
GiStoreZipFloat32V2(dst_ptr, rdst); | GiStoreZipFloat32V2(dst_ptr, rdst); | ||||
} | } | ||||
static GI_FORCEINLINE simd_type | static GI_FORCEINLINE simd_type | ||||
@@ -375,17 +375,24 @@ void resize_linear_32f_gi(const Mat32f& src, Mat32f& dst) { | |||||
int dy = 0; | int dy = 0; | ||||
GI_FLOAT32_t v_rx = GiBroadcastFloat32(rx); | GI_FLOAT32_t v_rx = GiBroadcastFloat32(rx); | ||||
GI_FLOAT32_t v_irx = GiBroadcastFloat32(irx); | GI_FLOAT32_t v_irx = GiBroadcastFloat32(irx); | ||||
// EXPAND(x): handles one 3-sub-vector group at element offset `dy + (x)*3`.
//   1. Loads a V3 value from cache0_ptr and from cache1_ptr with
//      GiLoadUzipFloat32V3 (presumably a de-interleaving load -- confirm).
//   2. For each sub-vector i in {0, 1, 2} computes
//      GiMlaqFloat32(v_rx * cache1[i], v_irx, cache0[i])
//      (presumably v_rx*cache1[i] + v_irx*cache0[i]; confirm the operand
//      order of GiMlaqFloat32 in the GI header).
//   3. Stores the three results to `pdst + dy + (x)*3` via GiStoreZipFloat32V3.
// The macro relies on names from the expansion site: v_cache0, v_cache1, v_dst,
// v_rx, v_irx, cache0_ptr, cache1_ptr, pdst, dy and -- in the accessor form --
// the temporaries a0, a1, a2.
//
// NOTE(review): this span is a diff rendering containing two forms of the
// macro. The first writes `v_dst.val[i]` directly; the second performs the same
// arithmetic through GiGetSubVectorFloat32V3 / GiSetSubVectorFloat32V3. Only
// the final GiStoreZipFloat32V3 row is shared by both.
#define EXPAND(x) \ | |||||
v_cache0 = GiLoadUzipFloat32V3(cache0_ptr + dy + (x)*3); \ | |||||
v_cache1 = GiLoadUzipFloat32V3(cache1_ptr + dy + (x)*3); \ | |||||
v_dst.val[0] = GiMlaqFloat32( \ | |||||
GiMultiplyFloat32(v_rx, v_cache1.val[0]), v_irx, v_cache0.val[0]); \ | |||||
v_dst.val[1] = GiMlaqFloat32( \ | |||||
GiMultiplyFloat32(v_rx, v_cache1.val[1]), v_irx, v_cache0.val[1]); \ | |||||
v_dst.val[2] = GiMlaqFloat32( \ | |||||
GiMultiplyFloat32(v_rx, v_cache1.val[2]), v_irx, v_cache0.val[2]); \ | |||||
#define EXPAND(x) \ | |||||
v_cache0 = GiLoadUzipFloat32V3(cache0_ptr + dy + (x)*3); \ | |||||
v_cache1 = GiLoadUzipFloat32V3(cache1_ptr + dy + (x)*3); \ | |||||
a0 = GiMlaqFloat32( \ | |||||
GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 0)), v_irx, \ | |||||
GiGetSubVectorFloat32V3(v_cache0, 0)); \ | |||||
GiSetSubVectorFloat32V3(v_dst, 0, a0); \ | |||||
a1 = GiMlaqFloat32( \ | |||||
GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 1)), v_irx, \ | |||||
GiGetSubVectorFloat32V3(v_cache0, 1)); \ | |||||
GiSetSubVectorFloat32V3(v_dst, 1, a1); \ | |||||
a2 = GiMlaqFloat32( \ | |||||
GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 2)), v_irx, \ | |||||
GiGetSubVectorFloat32V3(v_cache0, 2)); \ | |||||
GiSetSubVectorFloat32V3(v_dst, 2, a2); \ | |||||
GiStoreZipFloat32V3(pdst + dy + (x)*3, v_dst); | GiStoreZipFloat32V3(pdst + dy + (x)*3, v_dst); | ||||
GI_FLOAT32_t a0, a1, a2; | |||||
for (; dy + 8 * 3 <= dstcols; dy += 8 * 3) { | for (; dy + 8 * 3 <= dstcols; dy += 8 * 3) { | ||||
GI_FLOAT32_V3_t v_cache0; | GI_FLOAT32_V3_t v_cache0; | ||||
GI_FLOAT32_V3_t v_cache1; | GI_FLOAT32_V3_t v_cache1; | ||||
@@ -560,8 +567,12 @@ struct ResizeAreaFastVec_SIMD_32f { | |||||
for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) { | for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) { | ||||
GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1); | GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1); | ||||
GI_FLOAT32_t v_dst0 = GiAddFloat32(v_row0.val[0], v_row0.val[1]); | |||||
GI_FLOAT32_t v_dst1 = GiAddFloat32(v_row1.val[0], v_row1.val[1]); | |||||
GI_FLOAT32_t v_dst0 = GiAddFloat32( | |||||
GiGetSubVectorFloat32V2(v_row0, 0), | |||||
GiGetSubVectorFloat32V2(v_row0, 1)); | |||||
GI_FLOAT32_t v_dst1 = GiAddFloat32( | |||||
GiGetSubVectorFloat32V2(v_row1, 0), | |||||
GiGetSubVectorFloat32V2(v_row1, 1)); | |||||
GiStoreFloat32( | GiStoreFloat32( | ||||
D, GiMultiplyFloat32(GiAddFloat32(v_dst0, v_dst1), v_025)); | D, GiMultiplyFloat32(GiAddFloat32(v_dst0, v_dst1), v_025)); | ||||
@@ -17,13 +17,21 @@ compute_linear_element(const ctype src[4], const ctype alpha[2]) { | |||||
// Computes one output vector as a weighted combination of four source vectors.
//
// Starting from c = simd_helper::dup(0), four simd_helper::fma steps fold in
// source i (i = 0..3) with weight alpha[r ^ fh][s ^ fw], where (r, s) is
// (0,0), (0,1), (1,0), (1,1) for i = 0, 1, 2, 3. The template parameters fh and
// fw therefore select which weight is paired with which source by flipping the
// row / column index of the 2x2 alpha table.
// (simd_helper::fma is presumably c + a*b -- confirm in SIMDHelper.)
//
// Template params:
//   simd_helper - provides simd_type, dup and fma, plus (accessor form)
//                 simd_type_x4 and simd_fixlen_type.
//   fh, fw      - 0 or 1 in the callers visible in this file; XOR-ed into the
//                 alpha indices.
// Returns: the accumulated vector c.
//
// NOTE(review): this span is a diff rendering. The parameter rows and the fma
// rows appear twice: once taking `simd_type src[4]` / `simd_type alpha[2][2]`
// and indexing them directly, and once taking `simd_type_x4 src` /
// `simd_fixlen_type alpha[2][2]` and going through GiGetSubVectorFloat32V4 and
// GiFixLenType2GiFloat32Type. Both forms pair the same source index with the
// same alpha index.
template <typename simd_helper, size_t fh, size_t fw> | template <typename simd_helper, size_t fh, size_t fw> | ||||
static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element_simd( | static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element_simd( | ||||
const typename simd_helper::simd_type src[4], | |||||
const typename simd_helper::simd_type alpha[2][2]) { | |||||
const typename simd_helper::simd_type_x4 src, | |||||
const typename simd_helper::simd_fixlen_type alpha[2][2]) { | |||||
typename simd_helper::simd_type c = simd_helper::dup(0); | typename simd_helper::simd_type c = simd_helper::dup(0); | ||||
// Array-indexing form of the four accumulation steps.
c = simd_helper::fma(c, src[0], alpha[0 ^ fh][0 ^ fw]); | |||||
c = simd_helper::fma(c, src[1], alpha[0 ^ fh][1 ^ fw]); | |||||
c = simd_helper::fma(c, src[2], alpha[1 ^ fh][0 ^ fw]); | |||||
c = simd_helper::fma(c, src[3], alpha[1 ^ fh][1 ^ fw]); | |||||
// Accessor form of the same four accumulation steps.
c = simd_helper::fma( | |||||
c, GiGetSubVectorFloat32V4(src, 0), | |||||
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][0 ^ fw])); | |||||
c = simd_helper::fma( | |||||
c, GiGetSubVectorFloat32V4(src, 1), | |||||
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][1 ^ fw])); | |||||
c = simd_helper::fma( | |||||
c, GiGetSubVectorFloat32V4(src, 2), | |||||
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][0 ^ fw])); | |||||
c = simd_helper::fma( | |||||
c, GiGetSubVectorFloat32V4(src, 3), | |||||
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][1 ^ fw])); | |||||
return c; | return c; | ||||
} | } | ||||
@@ -62,23 +70,37 @@ static GI_FORCEINLINE void compute_linear_2x2_element( | |||||
// Produces a 2-row block of output from a 2x2 neighbourhood of input vectors.
//
// Four vectors are loaded from src, src + 1, src + IW and src + IW + 1. Each of
// the four (fh, fw) combinations of compute_linear_element_simd is evaluated on
// them with the shared alpha table. The (0,0)/(0,1) results are written to dst
// and the (1,0)/(1,1) results to dst + OW, both through
// simd_helper::store2_interleave.
//
// Params:
//   src   - input pointer; elements at offsets 0, 1, IW and IW + 1 (plus the
//           vector width loaded from each) are read.
//   dst   - output pointer; written at dst and dst + OW.
//   IW    - offset in elements between the two input rows read here.
//   OW    - offset in elements between the two output rows written here.
//   alpha - 2x2 table of weight vectors forwarded to
//           compute_linear_element_simd.
//
// NOTE(review): this span is a diff rendering. The `simd_type rsrc[4]` /
// `rdst[4]` rows and the `simd_type_x4 rsrc` / `rdst` rows are two forms of the
// same load-compute-store sequence (array indexing vs. the
// GiSetSubVectorFloat32V4 / GiGetSubVectorFloat32V4 accessors).
template <typename simd_helper> | template <typename simd_helper> | ||||
static GI_FORCEINLINE void compute_linear_2x2_element_simd( | static GI_FORCEINLINE void compute_linear_2x2_element_simd( | ||||
const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, | const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, | ||||
size_t IW, size_t OW, const typename simd_helper::simd_type alpha[2][2]) { | |||||
size_t IW, size_t OW, | |||||
const typename simd_helper::simd_fixlen_type alpha[2][2]) { | |||||
using simd_type_x4 = typename simd_helper::simd_type_x4; | |||||
using simd_type = typename simd_helper::simd_type; | using simd_type = typename simd_helper::simd_type; | ||||
// Array form: load the four neighbours, compute the four outputs, store.
simd_type rsrc[4]; | |||||
rsrc[0] = simd_helper::load(src); | |||||
rsrc[1] = simd_helper::load(src + 1); | |||||
rsrc[2] = simd_helper::load(src + IW); | |||||
rsrc[3] = simd_helper::load(src + IW + 1); | |||||
simd_type rdst[4]; | |||||
rdst[0] = compute_linear_element_simd<simd_helper, 0, 0>(rsrc, alpha); | |||||
rdst[1] = compute_linear_element_simd<simd_helper, 0, 1>(rsrc, alpha); | |||||
rdst[2] = compute_linear_element_simd<simd_helper, 1, 0>(rsrc, alpha); | |||||
rdst[3] = compute_linear_element_simd<simd_helper, 1, 1>(rsrc, alpha); | |||||
simd_helper::store2_interleave(dst, rdst[0], rdst[1]); | |||||
simd_helper::store2_interleave(dst + OW, rdst[2], rdst[3]); | |||||
// Accessor form: each load / result goes through `tmp` before being placed
// into slot 0..3 of the x4 value.
simd_type_x4 rsrc; | |||||
simd_type tmp; | |||||
tmp = simd_helper::load(src); | |||||
GiSetSubVectorFloat32V4(rsrc, 0, tmp); | |||||
tmp = simd_helper::load(src + 1); | |||||
GiSetSubVectorFloat32V4(rsrc, 1, tmp); | |||||
tmp = simd_helper::load(src + IW); | |||||
GiSetSubVectorFloat32V4(rsrc, 2, tmp); | |||||
tmp = simd_helper::load(src + IW + 1); | |||||
GiSetSubVectorFloat32V4(rsrc, 3, tmp); | |||||
simd_type_x4 rdst; | |||||
tmp = compute_linear_element_simd<simd_helper, 0, 0>(rsrc, alpha); | |||||
GiSetSubVectorFloat32V4(rdst, 0, tmp); | |||||
tmp = compute_linear_element_simd<simd_helper, 0, 1>(rsrc, alpha); | |||||
GiSetSubVectorFloat32V4(rdst, 1, tmp); | |||||
tmp = compute_linear_element_simd<simd_helper, 1, 0>(rsrc, alpha); | |||||
GiSetSubVectorFloat32V4(rdst, 2, tmp); | |||||
tmp = compute_linear_element_simd<simd_helper, 1, 1>(rsrc, alpha); | |||||
GiSetSubVectorFloat32V4(rdst, 3, tmp); | |||||
// Results 0/1 go to dst; results 2/3 go to dst + OW.
simd_helper::store2_interleave( | |||||
dst, GiGetSubVectorFloat32V4(rdst, 0), GiGetSubVectorFloat32V4(rdst, 1)); | |||||
simd_helper::store2_interleave( | |||||
dst + OW, GiGetSubVectorFloat32V4(rdst, 2), | |||||
GiGetSubVectorFloat32V4(rdst, 3)); | |||||
} | } | ||||
template <typename ctype> | template <typename ctype> | ||||
@@ -90,11 +112,11 @@ void linear_upsample2_nchw( | |||||
ctype alpha[2] = {0.75, 0.25}; | ctype alpha[2] = {0.75, 0.25}; | ||||
typename simd_helper::simd_type simd_alpha[2][2]; | |||||
simd_alpha[0][0] = simd_helper::dup(0.75 * 0.75); | |||||
simd_alpha[0][1] = simd_helper::dup(0.75 * 0.25); | |||||
simd_alpha[1][0] = simd_helper::dup(0.25 * 0.75); | |||||
simd_alpha[1][1] = simd_helper::dup(0.25 * 0.25); | |||||
typename simd_helper::simd_fixlen_type simd_alpha[2][2]; | |||||
simd_alpha[0][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.75)); | |||||
simd_alpha[0][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.25)); | |||||
simd_alpha[1][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.75)); | |||||
simd_alpha[1][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.25)); | |||||
for (size_t i = 0; i < N; ++i) { | for (size_t i = 0; i < N; ++i) { | ||||
compute_linear_2x2_element<ctype, false, false>( | compute_linear_2x2_element<ctype, false, false>( | ||||
@@ -8,20 +8,29 @@ namespace { | |||||
// Computes one output vector as a weighted combination of four source vectors.
//
// Starting from c = simd_helper::dup(0), four simd_helper::fma steps fold in
// source i (i = 0..3) with weight alpha[r ^ fh][s ^ fw], where (r, s) is
// (0,0), (0,1), (1,0), (1,1) for i = 0, 1, 2, 3. fh and fw flip the row /
// column index of the 2x2 alpha table, selecting which weight is applied to
// which source. (simd_helper::fma is presumably c + a*b -- confirm.)
//
// Returns: the accumulated vector c.
//
// NOTE(review): this span is a diff rendering. The parameter rows and the fma
// rows each appear in two forms: array indexing on `simd_type src[4]` /
// `simd_type alpha[2][2]`, and accessor calls (GiGetSubVectorFloat32V4,
// GiFixLenType2GiFloat32Type) on `simd_type_x4 src` /
// `simd_fixlen_type alpha[2][2]`. Both pair the same source with the same
// weight.
template <typename simd_helper, size_t fh, size_t fw> | template <typename simd_helper, size_t fh, size_t fw> | ||||
static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element( | static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element( | ||||
const typename simd_helper::simd_type src[4], | |||||
const typename simd_helper::simd_type alpha[2][2]) { | |||||
const typename simd_helper::simd_type_x4 src, | |||||
const typename simd_helper::simd_fixlen_type alpha[2][2]) { | |||||
typename simd_helper::simd_type c = simd_helper::dup(0); | typename simd_helper::simd_type c = simd_helper::dup(0); | ||||
// Array-indexing form of the four accumulation steps.
c = simd_helper::fma(c, src[0], alpha[0 ^ fh][0 ^ fw]); | |||||
c = simd_helper::fma(c, src[1], alpha[0 ^ fh][1 ^ fw]); | |||||
c = simd_helper::fma(c, src[2], alpha[1 ^ fh][0 ^ fw]); | |||||
c = simd_helper::fma(c, src[3], alpha[1 ^ fh][1 ^ fw]); | |||||
// Accessor form of the same four accumulation steps.
c = simd_helper::fma( | |||||
c, GiGetSubVectorFloat32V4(src, 0), | |||||
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][0 ^ fw])); | |||||
c = simd_helper::fma( | |||||
c, GiGetSubVectorFloat32V4(src, 1), | |||||
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][1 ^ fw])); | |||||
c = simd_helper::fma( | |||||
c, GiGetSubVectorFloat32V4(src, 2), | |||||
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][0 ^ fw])); | |||||
c = simd_helper::fma( | |||||
c, GiGetSubVectorFloat32V4(src, 3), | |||||
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][1 ^ fw])); | |||||
return c; | return c; | ||||
} | } | ||||
// Computes up to four output vectors from four source vectors and stores the
// ones enabled by the has_right / has_bottom template flags.
//
// Four vectors are loaded through src_ptr[0..3], and compute_linear_element is
// evaluated for each (fh, fw) in {0,1}x{0,1}. Stores:
//   (0,0) -> dst                      always
//   (0,1) -> dst + PC                 only if has_right
//   (1,0) -> dst + OW * PC            only if has_bottom
//   (1,1) -> dst + (OW + 1) * PC      only if has_right && has_bottom
// where PC = simd_helper::simd_width.
//
// NOTE(review): a diff hunk header sits inside this body, so the statements
// that offset src_ptr[1..3] are only partly visible (just
// `src_ptr[3] += IW * PC;` and a closing brace). How has_right / has_bottom
// affect the source pointers must be confirmed against the full file.
//
// NOTE(review): this span is a diff rendering. The `simd_type rsrc[4]` /
// `rdst[4]` rows and the `simd_type_x4 rsrc` / `rdst` rows are two forms of the
// same sequence, and each store row likewise appears twice.
template <typename simd_helper, bool has_right, bool has_bottom> | template <typename simd_helper, bool has_right, bool has_bottom> | ||||
static GI_FORCEINLINE void compute_linear_2x2_element( | static GI_FORCEINLINE void compute_linear_2x2_element( | ||||
const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, | const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, | ||||
size_t IW, size_t OW, const typename simd_helper::simd_type alpha[2][2]) { | |||||
size_t IW, size_t OW, | |||||
const typename simd_helper::simd_fixlen_type alpha[2][2]) { | |||||
constexpr size_t PC = simd_helper::simd_width; | constexpr size_t PC = simd_helper::simd_width; | ||||
// All four source pointers start at src; the offsets are applied below.
const typename simd_helper::ctype* src_ptr[4] = {src, src, src, src}; | const typename simd_helper::ctype* src_ptr[4] = {src, src, src, src}; | ||||
@@ -34,27 +43,33 @@ static GI_FORCEINLINE void compute_linear_2x2_element( | |||||
src_ptr[3] += IW * PC; | src_ptr[3] += IW * PC; | ||||
} | } | ||||
// Array form: load, compute the four combinations, unconditional store.
typename simd_helper::simd_type rsrc[4]; | |||||
rsrc[0] = simd_helper::load(src_ptr[0]); | |||||
rsrc[1] = simd_helper::load(src_ptr[1]); | |||||
rsrc[2] = simd_helper::load(src_ptr[2]); | |||||
rsrc[3] = simd_helper::load(src_ptr[3]); | |||||
typename simd_helper::simd_type rdst[4]; | |||||
rdst[0] = compute_linear_element<simd_helper, 0, 0>(rsrc, alpha); | |||||
rdst[1] = compute_linear_element<simd_helper, 0, 1>(rsrc, alpha); | |||||
rdst[2] = compute_linear_element<simd_helper, 1, 0>(rsrc, alpha); | |||||
rdst[3] = compute_linear_element<simd_helper, 1, 1>(rsrc, alpha); | |||||
simd_helper::store(dst, rdst[0]); | |||||
// Accessor form: results are held in a..d, then placed into slots 0..3.
typename simd_helper::simd_type_x4 rsrc; | |||||
GiSetSubVectorFloat32V4(rsrc, 0, simd_helper::load(src_ptr[0])); | |||||
GiSetSubVectorFloat32V4(rsrc, 1, simd_helper::load(src_ptr[1])); | |||||
GiSetSubVectorFloat32V4(rsrc, 2, simd_helper::load(src_ptr[2])); | |||||
GiSetSubVectorFloat32V4(rsrc, 3, simd_helper::load(src_ptr[3])); | |||||
typename simd_helper::simd_type_x4 rdst; | |||||
typename simd_helper::simd_type a, b, c, d; | |||||
a = compute_linear_element<simd_helper, 0, 0>(rsrc, alpha); | |||||
b = compute_linear_element<simd_helper, 0, 1>(rsrc, alpha); | |||||
c = compute_linear_element<simd_helper, 1, 0>(rsrc, alpha); | |||||
d = compute_linear_element<simd_helper, 1, 1>(rsrc, alpha); | |||||
GiSetSubVectorFloat32V4(rdst, 0, a); | |||||
GiSetSubVectorFloat32V4(rdst, 1, b); | |||||
GiSetSubVectorFloat32V4(rdst, 2, c); | |||||
GiSetSubVectorFloat32V4(rdst, 3, d); | |||||
simd_helper::store(dst, GiGetSubVectorFloat32V4(rdst, 0)); | |||||
// Conditional stores: the flags are template constants.
if (has_right) { | if (has_right) { | ||||
simd_helper::store(dst + PC, rdst[1]); | |||||
simd_helper::store(dst + PC, GiGetSubVectorFloat32V4(rdst, 1)); | |||||
} | } | ||||
if (has_bottom) { | if (has_bottom) { | ||||
simd_helper::store(dst + OW * PC, rdst[2]); | |||||
simd_helper::store(dst + OW * PC, GiGetSubVectorFloat32V4(rdst, 2)); | |||||
} | } | ||||
if (has_right && has_bottom) { | if (has_right && has_bottom) { | ||||
simd_helper::store(dst + (OW + 1) * PC, rdst[3]); | |||||
simd_helper::store(dst + (OW + 1) * PC, GiGetSubVectorFloat32V4(rdst, 3)); | |||||
} | } | ||||
} | } | ||||
@@ -65,11 +80,12 @@ void linear_upsample2_nchwxx( | |||||
size_t OW = IW * 2; | size_t OW = IW * 2; | ||||
constexpr size_t PC = simd_helper::simd_width; | constexpr size_t PC = simd_helper::simd_width; | ||||
typename simd_helper::simd_type alpha[2][2]; | |||||
alpha[0][0] = simd_helper::dup(0.75 * 0.75); | |||||
alpha[0][1] = simd_helper::dup(0.75 * 0.25); | |||||
alpha[1][0] = simd_helper::dup(0.25 * 0.75); | |||||
alpha[1][1] = simd_helper::dup(0.25 * 0.25); | |||||
typename simd_helper::simd_fixlen_type alpha[2][2]; | |||||
alpha[0][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.75)); | |||||
alpha[0][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.25)); | |||||
alpha[1][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.75)); | |||||
alpha[1][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.25)); | |||||
for (size_t i = 0; i < N; ++i) { | for (size_t i = 0; i < N; ++i) { | ||||
compute_linear_2x2_element<simd_helper, false, false>( | compute_linear_2x2_element<simd_helper, false, false>( | ||||