@@ -14,7 +14,9 @@ struct SIMDHelper {}; | |||
template <> | |||
struct SIMDHelper<float> { | |||
// Type and constant vocabulary of the float specialization; the kernels in
// this text reach these names through simd_helper::... .
// NOTE(review): this text is a diff hunk whose header grows the region from
// 7 to 9 lines, so simd_fixlen_type and simd_type_x4 look like the two added
// aliases -- confirm against the upstream change.
// One float32 SIMD vector.
using simd_type = GI_FLOAT32_t; | |||
// Judging by its name, a fixed-length form of simd_type; values are converted
// with GiFloat32Type2FixLenType / GiFixLenType2GiFloat32Type where arrays of
// vectors are built or passed as parameters.
using simd_fixlen_type = GI_FLOAT32_FIXLEN_t; | |||
// Bundles of two and four simd_type vectors, read and written through
// GiGetSubVectorFloat32V2/V4 and GiSetSubVectorFloat32V2/V4.
using simd_type_x2 = GI_FLOAT32_V2_t; | |||
using simd_type_x4 = GI_FLOAT32_V4_t; | |||
// Scalar element type.
using ctype = float; | |||
// presumably the number of ctype lanes in one simd_type -- TODO confirm
static constexpr size_t simd_width = 4; | |||
@@ -27,8 +29,8 @@ struct SIMDHelper<float> { | |||
// Packs rdst1 and rdst2 into a simd_type_x2 (sub-vector 0 and 1) and writes the
// pair to dst_ptr with one GiStoreZipFloat32V2 call.
// presumably the "zip" store interleaves the two vectors element by element in
// memory, as the function name suggests -- TODO confirm against the GI header.
static GI_FORCEINLINE void store2_interleave( | |||
ctype* dst_ptr, const simd_type& rdst1, const simd_type& rdst2) { | |||
simd_type_x2 rdst; | |||
// NOTE(review): the next four lines fill rdst twice because this is a diff
// hunk with the +/- markers lost (header: 8 lines before, 8 after). The
// direct .val[] writes look like the removed form and the
// GiSetSubVectorFloat32V2 calls like their replacement -- confirm upstream.
rdst.val[0] = rdst1; | |||
rdst.val[1] = rdst2; | |||
GiSetSubVectorFloat32V2(rdst, 0, rdst1); | |||
GiSetSubVectorFloat32V2(rdst, 1, rdst2); | |||
GiStoreZipFloat32V2(dst_ptr, rdst); | |||
} | |||
static GI_FORCEINLINE simd_type | |||
@@ -375,17 +375,24 @@ void resize_linear_32f_gi(const Mat32f& src, Mat32f& dst) { | |||
int dy = 0; | |||
GI_FLOAT32_t v_rx = GiBroadcastFloat32(rx); | |||
GI_FLOAT32_t v_irx = GiBroadcastFloat32(irx); | |||
// EXPAND(x): one blend step at column offset dy + x*3.
//   1. Loads v_cache0 and v_cache1 with GiLoadUzipFloat32V3 from cache0_ptr and
//      cache1_ptr.
//   2. For each of the three sub-vectors k computes
//        GiMlaqFloat32(GiMultiplyFloat32(v_rx, cache1_k), v_irx, cache0_k)
//      and places the result in sub-vector k of v_dst.
//   3. Writes v_dst to pdst + dy + x*3 with GiStoreZipFloat32V3.
// The macro expects v_cache0, v_cache1, v_dst, v_rx, v_irx, dy, pdst and the
// cache pointers (plus a0, a1, a2 for the second body) to be in scope at the
// expansion site.
// presumably GiMlaqFloat32(a, b, c) evaluates a + b * c, which would make each
// result rx * cache1 + irx * cache0 -- TODO confirm against the GI header.
// NOTE(review): two "#define EXPAND(x)" heads follow because this is a diff
// hunk with the +/- markers lost. The first body writes v_dst.val[k] directly;
// the second goes through a0..a2 and GiGet/GiSetSubVectorFloat32V3 and looks
// like the replacement. Both share the final store line -- confirm upstream.
// Keep comments out of the continuation lines below.
#define EXPAND(x) \ | |||
v_cache0 = GiLoadUzipFloat32V3(cache0_ptr + dy + (x)*3); \ | |||
v_cache1 = GiLoadUzipFloat32V3(cache1_ptr + dy + (x)*3); \ | |||
v_dst.val[0] = GiMlaqFloat32( \ | |||
GiMultiplyFloat32(v_rx, v_cache1.val[0]), v_irx, v_cache0.val[0]); \ | |||
v_dst.val[1] = GiMlaqFloat32( \ | |||
GiMultiplyFloat32(v_rx, v_cache1.val[1]), v_irx, v_cache0.val[1]); \ | |||
v_dst.val[2] = GiMlaqFloat32( \ | |||
GiMultiplyFloat32(v_rx, v_cache1.val[2]), v_irx, v_cache0.val[2]); \ | |||
#define EXPAND(x) \ | |||
v_cache0 = GiLoadUzipFloat32V3(cache0_ptr + dy + (x)*3); \ | |||
v_cache1 = GiLoadUzipFloat32V3(cache1_ptr + dy + (x)*3); \ | |||
a0 = GiMlaqFloat32( \ | |||
GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 0)), v_irx, \ | |||
GiGetSubVectorFloat32V3(v_cache0, 0)); \ | |||
GiSetSubVectorFloat32V3(v_dst, 0, a0); \ | |||
a1 = GiMlaqFloat32( \ | |||
GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 1)), v_irx, \ | |||
GiGetSubVectorFloat32V3(v_cache0, 1)); \ | |||
GiSetSubVectorFloat32V3(v_dst, 1, a1); \ | |||
a2 = GiMlaqFloat32( \ | |||
GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 2)), v_irx, \ | |||
GiGetSubVectorFloat32V3(v_cache0, 2)); \ | |||
GiSetSubVectorFloat32V3(v_dst, 2, a2); \ | |||
GiStoreZipFloat32V3(pdst + dy + (x)*3, v_dst); | |||
GI_FLOAT32_t a0, a1, a2; | |||
for (; dy + 8 * 3 <= dstcols; dy += 8 * 3) { | |||
GI_FLOAT32_V3_t v_cache0; | |||
GI_FLOAT32_V3_t v_cache1; | |||
@@ -560,8 +567,12 @@ struct ResizeAreaFastVec_SIMD_32f { | |||
for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) { | |||
GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1); | |||
GI_FLOAT32_t v_dst0 = GiAddFloat32(v_row0.val[0], v_row0.val[1]); | |||
GI_FLOAT32_t v_dst1 = GiAddFloat32(v_row1.val[0], v_row1.val[1]); | |||
GI_FLOAT32_t v_dst0 = GiAddFloat32( | |||
GiGetSubVectorFloat32V2(v_row0, 0), | |||
GiGetSubVectorFloat32V2(v_row0, 1)); | |||
GI_FLOAT32_t v_dst1 = GiAddFloat32( | |||
GiGetSubVectorFloat32V2(v_row1, 0), | |||
GiGetSubVectorFloat32V2(v_row1, 1)); | |||
GiStoreFloat32( | |||
D, GiMultiplyFloat32(GiAddFloat32(v_dst0, v_dst1), v_025)); | |||
@@ -17,13 +17,21 @@ compute_linear_element(const ctype src[4], const ctype alpha[2]) { | |||
// Combines the four source vectors with the four alpha weights into one
// output vector.
//
// Template parameters:
//   simd_helper - supplies simd_type, simd_type_x4, simd_fixlen_type, dup, fma.
//   fh, fw      - XOR-ed into the alpha row / column index, so (fh, fw) picks
//                 which weight pairs with which source vector.
// Parameters:
//   src   - the four source vectors.
//   alpha - 2x2 table of weight vectors.
// Returns the accumulator after four fma steps starting from dup(0).
// presumably simd_helper::fma(c, a, b) yields c + a * b -- TODO confirm.
//
// NOTE(review): this is a diff hunk with the +/- markers lost (header: 13
// lines before, 21 after), so two parameter lists and two sets of fma calls
// appear. The array-style forms (src[4], alpha of simd_type, src[k]) look like
// the removed text; the simd_type_x4 / simd_fixlen_type forms using
// GiGetSubVectorFloat32V4 and GiFixLenType2GiFloat32Type look like the
// replacement -- confirm upstream.
template <typename simd_helper, size_t fh, size_t fw> | |||
static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element_simd( | |||
const typename simd_helper::simd_type src[4], | |||
const typename simd_helper::simd_type alpha[2][2]) { | |||
const typename simd_helper::simd_type_x4 src, | |||
const typename simd_helper::simd_fixlen_type alpha[2][2]) { | |||
// Accumulator starts from dup(0).
typename simd_helper::simd_type c = simd_helper::dup(0); | |||
// Array-indexed accumulation.
c = simd_helper::fma(c, src[0], alpha[0 ^ fh][0 ^ fw]); | |||
c = simd_helper::fma(c, src[1], alpha[0 ^ fh][1 ^ fw]); | |||
c = simd_helper::fma(c, src[2], alpha[1 ^ fh][0 ^ fw]); | |||
c = simd_helper::fma(c, src[3], alpha[1 ^ fh][1 ^ fw]); | |||
// Same accumulation through sub-vector getters, converting each fixed-length
// alpha entry back to a simd_type before use.
c = simd_helper::fma( | |||
c, GiGetSubVectorFloat32V4(src, 0), | |||
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][0 ^ fw])); | |||
c = simd_helper::fma( | |||
c, GiGetSubVectorFloat32V4(src, 1), | |||
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][1 ^ fw])); | |||
c = simd_helper::fma( | |||
c, GiGetSubVectorFloat32V4(src, 2), | |||
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][0 ^ fw])); | |||
c = simd_helper::fma( | |||
c, GiGetSubVectorFloat32V4(src, 3), | |||
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][1 ^ fw])); | |||
return c; | |||
} | |||
@@ -62,23 +70,37 @@ static GI_FORCEINLINE void compute_linear_2x2_element( | |||
// Produces two output rows from a 2x2 neighbourhood of source vectors.
//
// Parameters:
//   src   - source pointer; vectors are loaded from src, src + 1, src + IW and
//           src + IW + 1.
//   dst   - destination pointer; results 0 and 1 are written at dst, results 2
//           and 3 at dst + OW, each pair through store2_interleave.
//   IW/OW - offsets between the two source rows / the two destination rows.
//           presumably the input and output row widths in elements -- TODO
//           confirm against the caller.
//   alpha - 2x2 table of weight vectors forwarded to
//           compute_linear_element_simd.
//
// NOTE(review): this is a diff hunk with the +/- markers lost (header: 23
// lines before, 37 after). The code using simd_type rsrc[4] / rdst[4] looks
// like the removed text, and the code using simd_type_x4 with
// GiSet/GiGetSubVectorFloat32V4 looks like the replacement -- confirm upstream.
template <typename simd_helper> | |||
static GI_FORCEINLINE void compute_linear_2x2_element_simd( | |||
const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, | |||
size_t IW, size_t OW, const typename simd_helper::simd_type alpha[2][2]) { | |||
size_t IW, size_t OW, | |||
const typename simd_helper::simd_fixlen_type alpha[2][2]) { | |||
using simd_type_x4 = typename simd_helper::simd_type_x4; | |||
using simd_type = typename simd_helper::simd_type; | |||
// Array form: load the four neighbours, compute the four (fh, fw) variants,
// then store them pairwise.
simd_type rsrc[4]; | |||
rsrc[0] = simd_helper::load(src); | |||
rsrc[1] = simd_helper::load(src + 1); | |||
rsrc[2] = simd_helper::load(src + IW); | |||
rsrc[3] = simd_helper::load(src + IW + 1); | |||
simd_type rdst[4]; | |||
rdst[0] = compute_linear_element_simd<simd_helper, 0, 0>(rsrc, alpha); | |||
rdst[1] = compute_linear_element_simd<simd_helper, 0, 1>(rsrc, alpha); | |||
rdst[2] = compute_linear_element_simd<simd_helper, 1, 0>(rsrc, alpha); | |||
rdst[3] = compute_linear_element_simd<simd_helper, 1, 1>(rsrc, alpha); | |||
simd_helper::store2_interleave(dst, rdst[0], rdst[1]); | |||
simd_helper::store2_interleave(dst + OW, rdst[2], rdst[3]); | |||
// Bundle form: the same sequence, with every vector staged in tmp and moved
// in or out of the simd_type_x4 bundles through the sub-vector accessors.
simd_type_x4 rsrc; | |||
simd_type tmp; | |||
tmp = simd_helper::load(src); | |||
GiSetSubVectorFloat32V4(rsrc, 0, tmp); | |||
tmp = simd_helper::load(src + 1); | |||
GiSetSubVectorFloat32V4(rsrc, 1, tmp); | |||
tmp = simd_helper::load(src + IW); | |||
GiSetSubVectorFloat32V4(rsrc, 2, tmp); | |||
tmp = simd_helper::load(src + IW + 1); | |||
GiSetSubVectorFloat32V4(rsrc, 3, tmp); | |||
simd_type_x4 rdst; | |||
tmp = compute_linear_element_simd<simd_helper, 0, 0>(rsrc, alpha); | |||
GiSetSubVectorFloat32V4(rdst, 0, tmp); | |||
tmp = compute_linear_element_simd<simd_helper, 0, 1>(rsrc, alpha); | |||
GiSetSubVectorFloat32V4(rdst, 1, tmp); | |||
tmp = compute_linear_element_simd<simd_helper, 1, 0>(rsrc, alpha); | |||
GiSetSubVectorFloat32V4(rdst, 2, tmp); | |||
tmp = compute_linear_element_simd<simd_helper, 1, 1>(rsrc, alpha); | |||
GiSetSubVectorFloat32V4(rdst, 3, tmp); | |||
simd_helper::store2_interleave( | |||
dst, GiGetSubVectorFloat32V4(rdst, 0), GiGetSubVectorFloat32V4(rdst, 1)); | |||
simd_helper::store2_interleave( | |||
dst + OW, GiGetSubVectorFloat32V4(rdst, 2), | |||
GiGetSubVectorFloat32V4(rdst, 3)); | |||
} | |||
template <typename ctype> | |||
@@ -90,11 +112,11 @@ void linear_upsample2_nchw( | |||
ctype alpha[2] = {0.75, 0.25}; | |||
typename simd_helper::simd_type simd_alpha[2][2]; | |||
simd_alpha[0][0] = simd_helper::dup(0.75 * 0.75); | |||
simd_alpha[0][1] = simd_helper::dup(0.75 * 0.25); | |||
simd_alpha[1][0] = simd_helper::dup(0.25 * 0.75); | |||
simd_alpha[1][1] = simd_helper::dup(0.25 * 0.25); | |||
typename simd_helper::simd_fixlen_type simd_alpha[2][2]; | |||
simd_alpha[0][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.75)); | |||
simd_alpha[0][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.25)); | |||
simd_alpha[1][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.75)); | |||
simd_alpha[1][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.25)); | |||
for (size_t i = 0; i < N; ++i) { | |||
compute_linear_2x2_element<ctype, false, false>( | |||
@@ -8,20 +8,29 @@ namespace { | |||
// Combines the four source vectors with the four alpha weights into one
// output vector.
//
// Template parameters:
//   simd_helper - supplies simd_type, simd_type_x4, simd_fixlen_type, dup, fma.
//   fh, fw      - XOR-ed into the alpha row / column index, selecting which
//                 weight is applied to which source vector.
// Parameters:
//   src   - the four source vectors.
//   alpha - 2x2 table of weight vectors.
// Returns the accumulator after four fma steps starting from dup(0).
// presumably simd_helper::fma(c, a, b) yields c + a * b -- TODO confirm.
//
// NOTE(review): this is a diff hunk with the +/- markers lost (header: 20
// lines before, 29 after), so two parameter lists and two sets of fma calls
// appear. The src[k] / simd_type forms look like the removed text; the
// GiGetSubVectorFloat32V4 / GiFixLenType2GiFloat32Type forms look like the
// replacement -- confirm upstream.
template <typename simd_helper, size_t fh, size_t fw> | |||
static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element( | |||
const typename simd_helper::simd_type src[4], | |||
const typename simd_helper::simd_type alpha[2][2]) { | |||
const typename simd_helper::simd_type_x4 src, | |||
const typename simd_helper::simd_fixlen_type alpha[2][2]) { | |||
// Accumulator starts from dup(0).
typename simd_helper::simd_type c = simd_helper::dup(0); | |||
// Array-indexed accumulation.
c = simd_helper::fma(c, src[0], alpha[0 ^ fh][0 ^ fw]); | |||
c = simd_helper::fma(c, src[1], alpha[0 ^ fh][1 ^ fw]); | |||
c = simd_helper::fma(c, src[2], alpha[1 ^ fh][0 ^ fw]); | |||
c = simd_helper::fma(c, src[3], alpha[1 ^ fh][1 ^ fw]); | |||
// Same accumulation through sub-vector getters, converting each fixed-length
// alpha entry back to a simd_type before use.
c = simd_helper::fma( | |||
c, GiGetSubVectorFloat32V4(src, 0), | |||
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][0 ^ fw])); | |||
c = simd_helper::fma( | |||
c, GiGetSubVectorFloat32V4(src, 1), | |||
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][1 ^ fw])); | |||
c = simd_helper::fma( | |||
c, GiGetSubVectorFloat32V4(src, 2), | |||
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][0 ^ fw])); | |||
c = simd_helper::fma( | |||
c, GiGetSubVectorFloat32V4(src, 3), | |||
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][1 ^ fw])); | |||
return c; | |||
} | |||
// Computes up to four output vectors from four source vectors and stores the
// ones enabled by the has_right / has_bottom template flags.
//
// Template parameters:
//   simd_helper - supplies the vector types, simd_width, load and store.
//   has_right   - when true, result 1 is stored at dst + PC.
//   has_bottom  - when true, result 2 is stored at dst + OW * PC.
//                 Result 3 is stored at dst + (OW + 1) * PC only when both
//                 flags are true; result 0 is always stored at dst.
// Parameters:
//   src, dst - source and destination pointers.
//   IW, OW   - used as offsets scaled by PC (simd_width).
//              presumably input / output widths counted in packed vectors --
//              TODO confirm against the caller.
//   alpha    - 2x2 table of weight vectors forwarded to compute_linear_element.
//
// NOTE(review): this is a diff rendering with the +/- markers lost, split over
// two hunks, so part of the body between them is not shown. Duplicate
// parameter lines and duplicate load / compute / store statements are the
// before and after forms side by side; the simd_type_x4 +
// GiSet/GiGetSubVectorFloat32V4 forms look like the replacement -- confirm
// upstream.
template <typename simd_helper, bool has_right, bool has_bottom> | |||
static GI_FORCEINLINE void compute_linear_2x2_element( | |||
const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, | |||
size_t IW, size_t OW, const typename simd_helper::simd_type alpha[2][2]) { | |||
size_t IW, size_t OW, | |||
const typename simd_helper::simd_fixlen_type alpha[2][2]) { | |||
constexpr size_t PC = simd_helper::simd_width; | |||
// All four source pointers start at src; the lines that adjust them are
// mostly outside the visible hunks.
const typename simd_helper::ctype* src_ptr[4] = {src, src, src, src}; | |||
@@ -34,27 +43,33 @@ static GI_FORCEINLINE void compute_linear_2x2_element( | |||
// presumably the tail of a conditional that advances src_ptr[3] by one row;
// the enclosing condition is not shown -- TODO confirm.
src_ptr[3] += IW * PC; | |||
} | |||
// Array form: load, compute the four (fh, fw) variants, store result 0.
typename simd_helper::simd_type rsrc[4]; | |||
rsrc[0] = simd_helper::load(src_ptr[0]); | |||
rsrc[1] = simd_helper::load(src_ptr[1]); | |||
rsrc[2] = simd_helper::load(src_ptr[2]); | |||
rsrc[3] = simd_helper::load(src_ptr[3]); | |||
typename simd_helper::simd_type rdst[4]; | |||
rdst[0] = compute_linear_element<simd_helper, 0, 0>(rsrc, alpha); | |||
rdst[1] = compute_linear_element<simd_helper, 0, 1>(rsrc, alpha); | |||
rdst[2] = compute_linear_element<simd_helper, 1, 0>(rsrc, alpha); | |||
rdst[3] = compute_linear_element<simd_helper, 1, 1>(rsrc, alpha); | |||
simd_helper::store(dst, rdst[0]); | |||
// Bundle form: same steps, with the vectors held in simd_type_x4 bundles and
// the four results staged in a, b, c, d before being placed in rdst.
typename simd_helper::simd_type_x4 rsrc; | |||
GiSetSubVectorFloat32V4(rsrc, 0, simd_helper::load(src_ptr[0])); | |||
GiSetSubVectorFloat32V4(rsrc, 1, simd_helper::load(src_ptr[1])); | |||
GiSetSubVectorFloat32V4(rsrc, 2, simd_helper::load(src_ptr[2])); | |||
GiSetSubVectorFloat32V4(rsrc, 3, simd_helper::load(src_ptr[3])); | |||
typename simd_helper::simd_type_x4 rdst; | |||
typename simd_helper::simd_type a, b, c, d; | |||
a = compute_linear_element<simd_helper, 0, 0>(rsrc, alpha); | |||
b = compute_linear_element<simd_helper, 0, 1>(rsrc, alpha); | |||
c = compute_linear_element<simd_helper, 1, 0>(rsrc, alpha); | |||
d = compute_linear_element<simd_helper, 1, 1>(rsrc, alpha); | |||
GiSetSubVectorFloat32V4(rdst, 0, a); | |||
GiSetSubVectorFloat32V4(rdst, 1, b); | |||
GiSetSubVectorFloat32V4(rdst, 2, c); | |||
GiSetSubVectorFloat32V4(rdst, 3, d); | |||
simd_helper::store(dst, GiGetSubVectorFloat32V4(rdst, 0)); | |||
// Each guarded store below appears twice: array form first, bundle form second.
if (has_right) { | |||
simd_helper::store(dst + PC, rdst[1]); | |||
simd_helper::store(dst + PC, GiGetSubVectorFloat32V4(rdst, 1)); | |||
} | |||
if (has_bottom) { | |||
simd_helper::store(dst + OW * PC, rdst[2]); | |||
simd_helper::store(dst + OW * PC, GiGetSubVectorFloat32V4(rdst, 2)); | |||
} | |||
if (has_right && has_bottom) { | |||
simd_helper::store(dst + (OW + 1) * PC, rdst[3]); | |||
simd_helper::store(dst + (OW + 1) * PC, GiGetSubVectorFloat32V4(rdst, 3)); | |||
} | |||
} | |||
@@ -65,11 +80,12 @@ void linear_upsample2_nchwxx( | |||
size_t OW = IW * 2; | |||
constexpr size_t PC = simd_helper::simd_width; | |||
typename simd_helper::simd_type alpha[2][2]; | |||
alpha[0][0] = simd_helper::dup(0.75 * 0.75); | |||
alpha[0][1] = simd_helper::dup(0.75 * 0.25); | |||
alpha[1][0] = simd_helper::dup(0.25 * 0.75); | |||
alpha[1][1] = simd_helper::dup(0.25 * 0.25); | |||
typename simd_helper::simd_fixlen_type alpha[2][2]; | |||
alpha[0][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.75)); | |||
alpha[0][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.25)); | |||
alpha[1][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.75)); | |||
alpha[1][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.25)); | |||
for (size_t i = 0; i < N; ++i) { | |||
compute_linear_2x2_element<simd_helper, false, false>( | |||