Browse Source

feat(gi): make resize apply gi class type

GitOrigin-RevId: 11acee2a0b
release-1.10
Megvii Engine Team 3 years ago
parent
commit
45b26400e7
4 changed files with 118 additions and 67 deletions
  1. +4
    -2
      dnn/src/fallback/resize/gi/helper.h
  2. +22
    -11
      dnn/src/fallback/resize/gi/resize_cv.cpp
  3. +48
    -26
      dnn/src/fallback/resize/gi/upsample2_nchw.cpp
  4. +44
    -28
      dnn/src/fallback/resize/gi/upsample2_nchwxx.cpp

+ 4
- 2
dnn/src/fallback/resize/gi/helper.h View File

@@ -14,7 +14,9 @@ struct SIMDHelper {};
template <> template <>
struct SIMDHelper<float> { struct SIMDHelper<float> {
using simd_type = GI_FLOAT32_t; using simd_type = GI_FLOAT32_t;
using simd_fixlen_type = GI_FLOAT32_FIXLEN_t;
using simd_type_x2 = GI_FLOAT32_V2_t; using simd_type_x2 = GI_FLOAT32_V2_t;
using simd_type_x4 = GI_FLOAT32_V4_t;
using ctype = float; using ctype = float;
static constexpr size_t simd_width = 4; static constexpr size_t simd_width = 4;


@@ -27,8 +29,8 @@ struct SIMDHelper<float> {
static GI_FORCEINLINE void store2_interleave( static GI_FORCEINLINE void store2_interleave(
ctype* dst_ptr, const simd_type& rdst1, const simd_type& rdst2) { ctype* dst_ptr, const simd_type& rdst1, const simd_type& rdst2) {
simd_type_x2 rdst; simd_type_x2 rdst;
rdst.val[0] = rdst1;
rdst.val[1] = rdst2;
GiSetSubVectorFloat32V2(rdst, 0, rdst1);
GiSetSubVectorFloat32V2(rdst, 1, rdst2);
GiStoreZipFloat32V2(dst_ptr, rdst); GiStoreZipFloat32V2(dst_ptr, rdst);
} }
static GI_FORCEINLINE simd_type static GI_FORCEINLINE simd_type


+ 22
- 11
dnn/src/fallback/resize/gi/resize_cv.cpp View File

@@ -375,17 +375,24 @@ void resize_linear_32f_gi(const Mat32f& src, Mat32f& dst) {
int dy = 0; int dy = 0;
GI_FLOAT32_t v_rx = GiBroadcastFloat32(rx); GI_FLOAT32_t v_rx = GiBroadcastFloat32(rx);
GI_FLOAT32_t v_irx = GiBroadcastFloat32(irx); GI_FLOAT32_t v_irx = GiBroadcastFloat32(irx);
#define EXPAND(x) \
v_cache0 = GiLoadUzipFloat32V3(cache0_ptr + dy + (x)*3); \
v_cache1 = GiLoadUzipFloat32V3(cache1_ptr + dy + (x)*3); \
v_dst.val[0] = GiMlaqFloat32( \
GiMultiplyFloat32(v_rx, v_cache1.val[0]), v_irx, v_cache0.val[0]); \
v_dst.val[1] = GiMlaqFloat32( \
GiMultiplyFloat32(v_rx, v_cache1.val[1]), v_irx, v_cache0.val[1]); \
v_dst.val[2] = GiMlaqFloat32( \
GiMultiplyFloat32(v_rx, v_cache1.val[2]), v_irx, v_cache0.val[2]); \
#define EXPAND(x) \
v_cache0 = GiLoadUzipFloat32V3(cache0_ptr + dy + (x)*3); \
v_cache1 = GiLoadUzipFloat32V3(cache1_ptr + dy + (x)*3); \
a0 = GiMlaqFloat32( \
GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 0)), v_irx, \
GiGetSubVectorFloat32V3(v_cache0, 0)); \
GiSetSubVectorFloat32V3(v_dst, 0, a0); \
a1 = GiMlaqFloat32( \
GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 1)), v_irx, \
GiGetSubVectorFloat32V3(v_cache0, 1)); \
GiSetSubVectorFloat32V3(v_dst, 1, a1); \
a2 = GiMlaqFloat32( \
GiMultiplyFloat32(v_rx, GiGetSubVectorFloat32V3(v_cache1, 2)), v_irx, \
GiGetSubVectorFloat32V3(v_cache0, 2)); \
GiSetSubVectorFloat32V3(v_dst, 2, a2); \
GiStoreZipFloat32V3(pdst + dy + (x)*3, v_dst); GiStoreZipFloat32V3(pdst + dy + (x)*3, v_dst);


GI_FLOAT32_t a0, a1, a2;
for (; dy + 8 * 3 <= dstcols; dy += 8 * 3) { for (; dy + 8 * 3 <= dstcols; dy += 8 * 3) {
GI_FLOAT32_V3_t v_cache0; GI_FLOAT32_V3_t v_cache0;
GI_FLOAT32_V3_t v_cache1; GI_FLOAT32_V3_t v_cache1;
@@ -560,8 +567,12 @@ struct ResizeAreaFastVec_SIMD_32f {
for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) { for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) {
GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1); GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1);


GI_FLOAT32_t v_dst0 = GiAddFloat32(v_row0.val[0], v_row0.val[1]);
GI_FLOAT32_t v_dst1 = GiAddFloat32(v_row1.val[0], v_row1.val[1]);
GI_FLOAT32_t v_dst0 = GiAddFloat32(
GiGetSubVectorFloat32V2(v_row0, 0),
GiGetSubVectorFloat32V2(v_row0, 1));
GI_FLOAT32_t v_dst1 = GiAddFloat32(
GiGetSubVectorFloat32V2(v_row1, 0),
GiGetSubVectorFloat32V2(v_row1, 1));


GiStoreFloat32( GiStoreFloat32(
D, GiMultiplyFloat32(GiAddFloat32(v_dst0, v_dst1), v_025)); D, GiMultiplyFloat32(GiAddFloat32(v_dst0, v_dst1), v_025));


+ 48
- 26
dnn/src/fallback/resize/gi/upsample2_nchw.cpp View File

@@ -17,13 +17,21 @@ compute_linear_element(const ctype src[4], const ctype alpha[2]) {


template <typename simd_helper, size_t fh, size_t fw> template <typename simd_helper, size_t fh, size_t fw>
static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element_simd( static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element_simd(
const typename simd_helper::simd_type src[4],
const typename simd_helper::simd_type alpha[2][2]) {
const typename simd_helper::simd_type_x4 src,
const typename simd_helper::simd_fixlen_type alpha[2][2]) {
typename simd_helper::simd_type c = simd_helper::dup(0); typename simd_helper::simd_type c = simd_helper::dup(0);
c = simd_helper::fma(c, src[0], alpha[0 ^ fh][0 ^ fw]);
c = simd_helper::fma(c, src[1], alpha[0 ^ fh][1 ^ fw]);
c = simd_helper::fma(c, src[2], alpha[1 ^ fh][0 ^ fw]);
c = simd_helper::fma(c, src[3], alpha[1 ^ fh][1 ^ fw]);
c = simd_helper::fma(
c, GiGetSubVectorFloat32V4(src, 0),
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][0 ^ fw]));
c = simd_helper::fma(
c, GiGetSubVectorFloat32V4(src, 1),
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][1 ^ fw]));
c = simd_helper::fma(
c, GiGetSubVectorFloat32V4(src, 2),
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][0 ^ fw]));
c = simd_helper::fma(
c, GiGetSubVectorFloat32V4(src, 3),
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][1 ^ fw]));
return c; return c;
} }


@@ -62,23 +70,37 @@ static GI_FORCEINLINE void compute_linear_2x2_element(
template <typename simd_helper> template <typename simd_helper>
static GI_FORCEINLINE void compute_linear_2x2_element_simd( static GI_FORCEINLINE void compute_linear_2x2_element_simd(
const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, const typename simd_helper::ctype* src, typename simd_helper::ctype* dst,
size_t IW, size_t OW, const typename simd_helper::simd_type alpha[2][2]) {
size_t IW, size_t OW,
const typename simd_helper::simd_fixlen_type alpha[2][2]) {
using simd_type_x4 = typename simd_helper::simd_type_x4;
using simd_type = typename simd_helper::simd_type; using simd_type = typename simd_helper::simd_type;


simd_type rsrc[4];
rsrc[0] = simd_helper::load(src);
rsrc[1] = simd_helper::load(src + 1);
rsrc[2] = simd_helper::load(src + IW);
rsrc[3] = simd_helper::load(src + IW + 1);

simd_type rdst[4];
rdst[0] = compute_linear_element_simd<simd_helper, 0, 0>(rsrc, alpha);
rdst[1] = compute_linear_element_simd<simd_helper, 0, 1>(rsrc, alpha);
rdst[2] = compute_linear_element_simd<simd_helper, 1, 0>(rsrc, alpha);
rdst[3] = compute_linear_element_simd<simd_helper, 1, 1>(rsrc, alpha);

simd_helper::store2_interleave(dst, rdst[0], rdst[1]);
simd_helper::store2_interleave(dst + OW, rdst[2], rdst[3]);
simd_type_x4 rsrc;
simd_type tmp;
tmp = simd_helper::load(src);
GiSetSubVectorFloat32V4(rsrc, 0, tmp);
tmp = simd_helper::load(src + 1);
GiSetSubVectorFloat32V4(rsrc, 1, tmp);
tmp = simd_helper::load(src + IW);
GiSetSubVectorFloat32V4(rsrc, 2, tmp);
tmp = simd_helper::load(src + IW + 1);
GiSetSubVectorFloat32V4(rsrc, 3, tmp);

simd_type_x4 rdst;
tmp = compute_linear_element_simd<simd_helper, 0, 0>(rsrc, alpha);
GiSetSubVectorFloat32V4(rdst, 0, tmp);
tmp = compute_linear_element_simd<simd_helper, 0, 1>(rsrc, alpha);
GiSetSubVectorFloat32V4(rdst, 1, tmp);
tmp = compute_linear_element_simd<simd_helper, 1, 0>(rsrc, alpha);
GiSetSubVectorFloat32V4(rdst, 2, tmp);
tmp = compute_linear_element_simd<simd_helper, 1, 1>(rsrc, alpha);
GiSetSubVectorFloat32V4(rdst, 3, tmp);

simd_helper::store2_interleave(
dst, GiGetSubVectorFloat32V4(rdst, 0), GiGetSubVectorFloat32V4(rdst, 1));
simd_helper::store2_interleave(
dst + OW, GiGetSubVectorFloat32V4(rdst, 2),
GiGetSubVectorFloat32V4(rdst, 3));
} }


template <typename ctype> template <typename ctype>
@@ -90,11 +112,11 @@ void linear_upsample2_nchw(


ctype alpha[2] = {0.75, 0.25}; ctype alpha[2] = {0.75, 0.25};


typename simd_helper::simd_type simd_alpha[2][2];
simd_alpha[0][0] = simd_helper::dup(0.75 * 0.75);
simd_alpha[0][1] = simd_helper::dup(0.75 * 0.25);
simd_alpha[1][0] = simd_helper::dup(0.25 * 0.75);
simd_alpha[1][1] = simd_helper::dup(0.25 * 0.25);
typename simd_helper::simd_fixlen_type simd_alpha[2][2];
simd_alpha[0][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.75));
simd_alpha[0][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.25));
simd_alpha[1][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.75));
simd_alpha[1][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.25));


for (size_t i = 0; i < N; ++i) { for (size_t i = 0; i < N; ++i) {
compute_linear_2x2_element<ctype, false, false>( compute_linear_2x2_element<ctype, false, false>(


+ 44
- 28
dnn/src/fallback/resize/gi/upsample2_nchwxx.cpp View File

@@ -8,20 +8,29 @@ namespace {


template <typename simd_helper, size_t fh, size_t fw> template <typename simd_helper, size_t fh, size_t fw>
static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element( static GI_FORCEINLINE typename simd_helper::simd_type compute_linear_element(
const typename simd_helper::simd_type src[4],
const typename simd_helper::simd_type alpha[2][2]) {
const typename simd_helper::simd_type_x4 src,
const typename simd_helper::simd_fixlen_type alpha[2][2]) {
typename simd_helper::simd_type c = simd_helper::dup(0); typename simd_helper::simd_type c = simd_helper::dup(0);
c = simd_helper::fma(c, src[0], alpha[0 ^ fh][0 ^ fw]);
c = simd_helper::fma(c, src[1], alpha[0 ^ fh][1 ^ fw]);
c = simd_helper::fma(c, src[2], alpha[1 ^ fh][0 ^ fw]);
c = simd_helper::fma(c, src[3], alpha[1 ^ fh][1 ^ fw]);
c = simd_helper::fma(
c, GiGetSubVectorFloat32V4(src, 0),
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][0 ^ fw]));
c = simd_helper::fma(
c, GiGetSubVectorFloat32V4(src, 1),
GiFixLenType2GiFloat32Type(alpha[0 ^ fh][1 ^ fw]));
c = simd_helper::fma(
c, GiGetSubVectorFloat32V4(src, 2),
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][0 ^ fw]));
c = simd_helper::fma(
c, GiGetSubVectorFloat32V4(src, 3),
GiFixLenType2GiFloat32Type(alpha[1 ^ fh][1 ^ fw]));
return c; return c;
} }


template <typename simd_helper, bool has_right, bool has_bottom> template <typename simd_helper, bool has_right, bool has_bottom>
static GI_FORCEINLINE void compute_linear_2x2_element( static GI_FORCEINLINE void compute_linear_2x2_element(
const typename simd_helper::ctype* src, typename simd_helper::ctype* dst, const typename simd_helper::ctype* src, typename simd_helper::ctype* dst,
size_t IW, size_t OW, const typename simd_helper::simd_type alpha[2][2]) {
size_t IW, size_t OW,
const typename simd_helper::simd_fixlen_type alpha[2][2]) {
constexpr size_t PC = simd_helper::simd_width; constexpr size_t PC = simd_helper::simd_width;
const typename simd_helper::ctype* src_ptr[4] = {src, src, src, src}; const typename simd_helper::ctype* src_ptr[4] = {src, src, src, src};


@@ -34,27 +43,33 @@ static GI_FORCEINLINE void compute_linear_2x2_element(
src_ptr[3] += IW * PC; src_ptr[3] += IW * PC;
} }


typename simd_helper::simd_type rsrc[4];
rsrc[0] = simd_helper::load(src_ptr[0]);
rsrc[1] = simd_helper::load(src_ptr[1]);
rsrc[2] = simd_helper::load(src_ptr[2]);
rsrc[3] = simd_helper::load(src_ptr[3]);

typename simd_helper::simd_type rdst[4];
rdst[0] = compute_linear_element<simd_helper, 0, 0>(rsrc, alpha);
rdst[1] = compute_linear_element<simd_helper, 0, 1>(rsrc, alpha);
rdst[2] = compute_linear_element<simd_helper, 1, 0>(rsrc, alpha);
rdst[3] = compute_linear_element<simd_helper, 1, 1>(rsrc, alpha);

simd_helper::store(dst, rdst[0]);
typename simd_helper::simd_type_x4 rsrc;
GiSetSubVectorFloat32V4(rsrc, 0, simd_helper::load(src_ptr[0]));
GiSetSubVectorFloat32V4(rsrc, 1, simd_helper::load(src_ptr[1]));
GiSetSubVectorFloat32V4(rsrc, 2, simd_helper::load(src_ptr[2]));
GiSetSubVectorFloat32V4(rsrc, 3, simd_helper::load(src_ptr[3]));

typename simd_helper::simd_type_x4 rdst;
typename simd_helper::simd_type a, b, c, d;
a = compute_linear_element<simd_helper, 0, 0>(rsrc, alpha);
b = compute_linear_element<simd_helper, 0, 1>(rsrc, alpha);
c = compute_linear_element<simd_helper, 1, 0>(rsrc, alpha);
d = compute_linear_element<simd_helper, 1, 1>(rsrc, alpha);

GiSetSubVectorFloat32V4(rdst, 0, a);
GiSetSubVectorFloat32V4(rdst, 1, b);
GiSetSubVectorFloat32V4(rdst, 2, c);
GiSetSubVectorFloat32V4(rdst, 3, d);

simd_helper::store(dst, GiGetSubVectorFloat32V4(rdst, 0));
if (has_right) { if (has_right) {
simd_helper::store(dst + PC, rdst[1]);
simd_helper::store(dst + PC, GiGetSubVectorFloat32V4(rdst, 1));
} }
if (has_bottom) { if (has_bottom) {
simd_helper::store(dst + OW * PC, rdst[2]);
simd_helper::store(dst + OW * PC, GiGetSubVectorFloat32V4(rdst, 2));
} }
if (has_right && has_bottom) { if (has_right && has_bottom) {
simd_helper::store(dst + (OW + 1) * PC, rdst[3]);
simd_helper::store(dst + (OW + 1) * PC, GiGetSubVectorFloat32V4(rdst, 3));
} }
} }


@@ -65,11 +80,12 @@ void linear_upsample2_nchwxx(
size_t OW = IW * 2; size_t OW = IW * 2;
constexpr size_t PC = simd_helper::simd_width; constexpr size_t PC = simd_helper::simd_width;


typename simd_helper::simd_type alpha[2][2];
alpha[0][0] = simd_helper::dup(0.75 * 0.75);
alpha[0][1] = simd_helper::dup(0.75 * 0.25);
alpha[1][0] = simd_helper::dup(0.25 * 0.75);
alpha[1][1] = simd_helper::dup(0.25 * 0.25);
typename simd_helper::simd_fixlen_type alpha[2][2];

alpha[0][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.75));
alpha[0][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.75 * 0.25));
alpha[1][0] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.75));
alpha[1][1] = GiFloat32Type2FixLenType(simd_helper::dup(0.25 * 0.25));


for (size_t i = 0; i < N; ++i) { for (size_t i = 0; i < N; ++i) {
compute_linear_2x2_element<simd_helper, false, false>( compute_linear_2x2_element<simd_helper, false, false>(


Loading…
Cancel
Save