Browse Source

feat(gi): make elemwise apply gi class type

GitOrigin-RevId: 6ff1a8a55c
release-1.10
Megvii Engine Team 3 years ago
parent
commit
8546c15d45
28 changed files with 1297 additions and 838 deletions
  1. +46
    -14
      dnn/src/arm_common/elemwise_helper/elemwise_op.h
  2. +1
    -1
      dnn/src/fallback/elemwise/gi_impl/gi_mathfun.cpp
  3. +32
    -19
      dnn/src/fallback/elemwise_helper/kimpl/abs.h
  4. +47
    -16
      dnn/src/fallback/elemwise_helper/kimpl/add.h
  5. +22
    -16
      dnn/src/fallback/elemwise_helper/kimpl/exp.h
  6. +9
    -5
      dnn/src/fallback/elemwise_helper/kimpl/fast_tanh.h
  7. +31
    -14
      dnn/src/fallback/elemwise_helper/kimpl/fuse_add_h_swish.h
  8. +43
    -17
      dnn/src/fallback/elemwise_helper/kimpl/fuse_add_relu.h
  9. +11
    -7
      dnn/src/fallback/elemwise_helper/kimpl/fuse_add_sigmoid.h
  10. +11
    -7
      dnn/src/fallback/elemwise_helper/kimpl/fuse_add_tanh.h
  11. +30
    -22
      dnn/src/fallback/elemwise_helper/kimpl/fuse_mul_add3.h
  12. +51
    -38
      dnn/src/fallback/elemwise_helper/kimpl/hswish.h
  13. +29
    -10
      dnn/src/fallback/elemwise_helper/kimpl/max.h
  14. +29
    -10
      dnn/src/fallback/elemwise_helper/kimpl/min.h
  15. +29
    -10
      dnn/src/fallback/elemwise_helper/kimpl/mul.h
  16. +20
    -19
      dnn/src/fallback/elemwise_helper/kimpl/none.h
  17. +206
    -142
      dnn/src/fallback/elemwise_helper/kimpl/op_base.h
  18. +67
    -39
      dnn/src/fallback/elemwise_helper/kimpl/relu.h
  19. +27
    -21
      dnn/src/fallback/elemwise_helper/kimpl/sigmoid.h
  20. +29
    -10
      dnn/src/fallback/elemwise_helper/kimpl/sub.h
  21. +52
    -48
      dnn/src/fallback/elemwise_helper/kimpl/tanh.h
  22. +11
    -7
      dnn/src/fallback/elemwise_helper/kimpl/true_div.h
  23. +16
    -7
      dnn/src/fallback/elemwise_helper/kimpl/typecvt.h
  24. +182
    -131
      dnn/src/fallback/elemwise_helper/op_common.h
  25. +3
    -2
      dnn/src/fallback/gi_intrinsic_helper.h
  26. +7
    -1
      dnn/src/fallback/quantized_converter.h
  27. +195
    -156
      dnn/src/fallback/reduce/reducer.h
  28. +61
    -49
      dnn/src/fallback/type_cvt/typecvt_helper.h

+ 46
- 14
dnn/src/arm_common/elemwise_helper/elemwise_op.h View File

@@ -12,7 +12,7 @@ using BcastType = megdnn::elemwise::BcastType;

///////////////////////////////// ParamElemVistor ///////////////////////////

#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix) \
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, _neon_type_v2) \
template <> \
struct ParamElemVisitor<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
@@ -24,29 +24,61 @@ using BcastType = megdnn::elemwise::BcastType;
_neon_type operator()(const _ctype* src) const { \
return vdupq_n_##_fun_suffix(*reinterpret_cast<const _inner_ctype*>(src)); \
} \
}; \
template <> \
struct ParamElemVisitorV2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src, const _ctype* src_1) const { \
_neon_type_v2 ret; \
ret.val[0] = \
vld1q_##_fun_suffix(reinterpret_cast<const _inner_ctype*>(src)); \
ret.val[1] = \
vld1q_##_fun_suffix(reinterpret_cast<const _inner_ctype*>(src_1)); \
return ret; \
} \
}; \
template <> \
struct ParamElemVisitorDupV2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src) const { \
_neon_type_v2 ret; \
ret.val[0] = vdupq_n_##_fun_suffix( \
*reinterpret_cast<const _inner_ctype*>(src)); \
ret.val[1] = ret.val[0]; \
return ret; \
} \
}
cb(dt_quint8, uint8_t, uint8x16_t, u8);
cb(dt_quint8, uint8_t, uint8x16_t, u8, uint8x16x2_t);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb(__fp16, __fp16, float16x8_t, f16);
cb(__fp16, __fp16, float16x8_t, f16, float16x8x2_t);
#endif
cb(dt_int16, int16_t, int16x8_t, s16);
cb(dt_int16, int16_t, int16x8_t, s16, int16x8x2_t);
#undef cb

template <typename ctype>
struct ParamElemVisitorBcast101x4;
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix) \
template <> \
struct ParamElemVisitorBcast101x4<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
} \
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix, _neon_type_v2) \
template <> \
struct ParamElemVisitorBcast101x4<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
} \
}; \
template <> \
struct ParamElemVisitorBcast101x4V2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src) const { \
_neon_type_v2 ret; \
ret.val[0] = \
vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
ret.val[1] = ret.val[0]; \
return ret; \
} \
}

cb(dt_quint8, uint32_t, uint8x16_t, u8, u32);
cb(dt_int16, int64_t, int16x8_t, s16, s64);
cb(dt_quint8, uint32_t, uint8x16_t, u8, u32, uint8x16x2_t);
cb(dt_int16, int64_t, int16x8_t, s16, s64, int16x8x2_t);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb(__fp16, uint64_t, float16x8_t, f16, u64);
cb(__fp16, uint64_t, float16x8_t, f16, u64, float16x8x2_t);
#endif
#undef cb



+ 1
- 1
dnn/src/fallback/elemwise/gi_impl/gi_mathfun.cpp View File

@@ -283,7 +283,7 @@ v4sf GiCosPsFloat32(v4sf x) {
v4sf GiTanPsFloat32(v4sf x) {
v4sf ysin, ycos;
GiSinCosPsFloat32(x, &ysin, &ycos);
return ysin / ycos;
return GiDivFloat32(ysin, ycos);
}

#undef c_exp_hi


+ 32
- 19
dnn/src/fallback/elemwise_helper/kimpl/abs.h View File

@@ -20,22 +20,28 @@ struct AbsOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct AbsOp;

#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \
template <> \
struct AbsOp<_ctype> : AbsOpBase<_ctype> { \
using AbsOpBase::AbsOpBase; \
using AbsOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _gi_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_gi_type operator()(const _gi_type& src) const { \
auto vitem0 = GiAbs##_func_suffix(src.val[0]); \
auto vitem1 = GiAbs##_func_suffix(src.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \
template <> \
struct AbsOp<_ctype> : AbsOpBase<_ctype> { \
using AbsOpBase::AbsOpBase; \
using AbsOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _gi_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_gi_type operator()(const _gi_type& src) const { \
auto vitem0 = \
GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \
auto vitem1 = \
GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \
_gi_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(dt_float32))
OP(dt_int32, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(dt_int32))
@@ -64,11 +70,18 @@ struct AbsOp<dt_qint8, dt_qint8> : AbsOpBase<dt_qint8, dt_qint8> {
OPERATOR_UNARY_QINT8_FALLBACK;
}
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale));
vitem0 = GiAbsFloat32(vitem0);
vitem1 = GiAbsFloat32(vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};



+ 47
- 16
dnn/src/fallback/elemwise_helper/kimpl/add.h View File

@@ -33,13 +33,21 @@ struct AddOp;
void operator()( \
const _gi_type2& src0, const _gi_type2& src1, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_gi_type2 operator()(const _gi_type2& src0, const _gi_type2& src1) const { \
auto vitem0 = GiAdd##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiAdd##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_gi_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _gi_type& src0, const _gi_type& src1, dst_ctype* dst) const { \
@@ -82,13 +90,24 @@ struct AddOp<dt_qint8, dt_qint8> : AddOpBase<dt_qint8, dt_qint8> {

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));

return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);

return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};

@@ -119,12 +138,24 @@ struct AddOp<dt_qint32, dt_qint8> : AddOpBase<dt_qint32, dt_qint8> {

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);

return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};



+ 22
- 16
dnn/src/fallback/elemwise_helper/kimpl/exp.h View File

@@ -23,22 +23,28 @@ struct ExpOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct ExpOp;

#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct ExpOp<_ctype> : ExpOpBase<_ctype> { \
using ExpOpBase::ExpOpBase; \
using ExpOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto vitem0 = GiExpPs##_func_suffix(src.val[0]); \
auto vitem1 = GiExpPs##_func_suffix(src.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct ExpOp<_ctype> : ExpOpBase<_ctype> { \
using ExpOpBase::ExpOpBase; \
using ExpOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto vitem0 = \
GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \
auto vitem1 = \
GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
#undef OP


+ 9
- 5
dnn/src/fallback/elemwise_helper/kimpl/fast_tanh.h View File

@@ -32,14 +32,15 @@ struct FastTanhOp;
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_27 = GiBroadcast##_func_suffix(27.f); \
auto val_9 = GiBroadcast##_func_suffix(9.f); \
auto valx = src.val[0]; \
auto valx1 = src.val[1]; \
auto valx = GiGetSubVector##_func_suffix##V2(src, 0); \
auto valx1 = GiGetSubVector##_func_suffix##V2(src, 1); \
auto valxp2 = GiMultiply##_fix_func_suffix(valx, valx); \
auto valx1p2 = GiMultiply##_fix_func_suffix(valx1, valx1); \
auto denominator = GiAdd##_fix_func_suffix(valxp2, val_27); \
@@ -58,7 +59,10 @@ struct FastTanhOp;
r_denominator1); \
valx = GiMultiply##_fix_func_suffix(valx, r_denominator); \
valx1 = GiMultiply##_fix_func_suffix(valx1, r_denominator1); \
return {{valx, valx1}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, valx); \
GiSetSubVector##_func_suffix##V2(ret, 1, valx1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, Float32, GI_SIMD_LEN_BYTE / sizeof(float))


+ 31
- 14
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_h_swish.h View File

@@ -36,19 +36,23 @@ struct FuseAddHSwishOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -98,15 +102,28 @@ struct FuseAddHSwishOp<dt_qint32, dt_qint8> : FuseAddHSwishOpBase<dt_qint32, dt_
GI_FLOAT32_t vitem0, vitem1;

vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src1)));
vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src1)));
H_SWISH_KERN_FALLBACK(Float32, vitem0, vitem1);
vitem0 = GiMultiplyFloat32(vitem0, this->vscale_dst);
vitem1 = GiMultiplyFloat32(vitem1, this->vscale_dst);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
vitem0 =
GiMultiplyFloat32(vitem0, GiFixLenType2GiFloat32Type(this->vscale_dst));
vitem1 =
GiMultiplyFloat32(vitem1, GiFixLenType2GiFloat32Type(this->vscale_dst));
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};



+ 43
- 17
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_relu.h View File

@@ -35,17 +35,21 @@ struct FuseAddReluOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
FUSE_ADD_RELU_SIMD_PACK2_FALLBACK(val1, val2, val3, val4, _func_suffix); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -105,15 +109,26 @@ struct FuseAddReluOp<dt_qint8, dt_qint8> : FuseAddReluOpBase<dt_qint8, dt_qint8>

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));

vitem0 = GiMaximumFloat32(vitem0, this->vzero());
vitem1 = GiMaximumFloat32(vitem1, this->vzero());
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};

@@ -144,15 +159,26 @@ struct FuseAddReluOp<dt_qint32, dt_qint8> : FuseAddReluOpBase<dt_qint32, dt_qint

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));

vitem0 = GiMaximumFloat32(vitem0, this->vzero());
vitem1 = GiMaximumFloat32(vitem1, this->vzero());
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};



+ 11
- 7
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_sigmoid.h View File

@@ -36,19 +36,23 @@ struct FuseAddSigmoidOp;
const _simd_type& src0, const _simd_type& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
val1 = GiSigmoidPs##_func_suffix(val1); \
val2 = GiSigmoidPs##_func_suffix(val2); \
return {{val1, val2}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))


+ 11
- 7
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_tanh.h View File

@@ -35,14 +35,15 @@ struct FuseAddTanhOp;
const _simd_type& src0, const _simd_type& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
auto exp1 = GiExpPs##_func_suffix(val1); \
@@ -65,7 +66,10 @@ struct FuseAddTanhOp;
GiRecpeS##_func_suffix(exp2, rexp2), rexp2); \
val1 = GiMultiply##_func_suffix(val1, rexp1); \
val2 = GiMultiply##_func_suffix(val2, rexp2); \
return {{val1, val2}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))


+ 30
- 22
dnn/src/fallback/elemwise_helper/kimpl/fuse_mul_add3.h View File

@@ -26,28 +26,36 @@ struct FuseMulAdd3OpBase : TernaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct FuseMulAdd3Op;

#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \
using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \
using FuseMulAdd3OpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1, src2); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2) const { \
auto vitem0 = GiMultiplyAdd##_func_suffix( \
src2.val[0], src0.val[0], src1.val[0]); \
auto vitem1 = GiMultiplyAdd##_func_suffix( \
src2.val[1], src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \
using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \
using FuseMulAdd3OpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1, src2); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2) const { \
auto vitem0 = GiMultiplyAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src2, 0), \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMultiplyAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src2, 1), \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
OP(dt_int32, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t))


+ 51
- 38
dnn/src/fallback/elemwise_helper/kimpl/hswish.h View File

@@ -26,39 +26,43 @@ struct HSwishOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct HSwishOp;

#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \
using HSwishOpBase::HSwishOpBase; \
using HSwishOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto val1 = src.val[0]; \
auto val2 = src.val[1]; \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
return {{val1, val2}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_zero = GiBroadcast##_func_suffix(0.f); \
auto val_six = GiBroadcast##_func_suffix(6.f); \
auto val_three = GiBroadcast##_func_suffix(3.f); \
auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \
auto clip1 = GiMaximum##_func_suffix( \
GiMinimum##_func_suffix( \
GiAdd##_func_suffix(src, val_three), val_six), \
val_zero); \
return GiMultiply##_func_suffix( \
GiMultiply##_func_suffix(src, clip1), val_rec_six); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \
using HSwishOpBase::HSwishOpBase; \
using HSwishOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_zero = GiBroadcast##_func_suffix(0.f); \
auto val_six = GiBroadcast##_func_suffix(6.f); \
auto val_three = GiBroadcast##_func_suffix(3.f); \
auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \
auto clip1 = GiMaximum##_func_suffix( \
GiMinimum##_func_suffix( \
GiAdd##_func_suffix(src, val_three), val_six), \
val_zero); \
return GiMultiply##_func_suffix( \
GiMultiply##_func_suffix(src, clip1), val_rec_six); \
} \
};

OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
@@ -90,14 +94,23 @@ struct HSwishOp<dt_qint32, dt_qint8> : HSwishOpBase<dt_qint32, dt_qint8> {
}

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale_src);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale_src);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src));

H_SWISH_KERN_FALLBACK(Float32, vitem0, vitem1);
vitem0 = GiMultiplyFloat32(vitem0, this->vscale_dst);
vitem1 = GiMultiplyFloat32(vitem1, this->vscale_dst);
vitem0 =
GiMultiplyFloat32(vitem0, GiFixLenType2GiFloat32Type(this->vscale_dst));
vitem1 =
GiMultiplyFloat32(vitem1, GiFixLenType2GiFloat32Type(this->vscale_dst));
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);

return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};



+ 29
- 10
dnn/src/fallback/elemwise_helper/kimpl/max.h View File

@@ -32,14 +32,22 @@ struct MaxOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMaximum##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMaximum##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -87,12 +95,23 @@ struct MaxOp<dt_qint8, dt_qint8> : MaxOpBase<dt_qint8, dt_qint8> {

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiMaximumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiMaximumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};



+ 29
- 10
dnn/src/fallback/elemwise_helper/kimpl/min.h View File

@@ -33,14 +33,22 @@ struct MinOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMinimum##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMinimum##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMinimum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMinimum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -84,12 +92,23 @@ struct MinOp<dt_qint8, dt_qint8> : MinOpBase<dt_qint8, dt_qint8> {

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiMinimumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiMinimumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};



+ 29
- 10
dnn/src/fallback/elemwise_helper/kimpl/mul.h View File

@@ -33,14 +33,22 @@ struct MulOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMultiply##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMultiply##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMultiply##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMultiply##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -83,13 +91,24 @@ struct MulOp<dt_qint8, dt_qint8> : MulOpBase<dt_qint8, dt_qint8> {

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiMultiplyFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiMultiplyFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));

return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};



+ 20
- 19
dnn/src/fallback/elemwise_helper/kimpl/none.h View File

@@ -16,23 +16,24 @@ struct NoneOpBase : UnaryOpBase<src_ctype, dst_ctype> {

template <typename src_ctype, typename dst_type = src_ctype>
struct NoneOp;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct NoneOp<_ctype> : NoneOpBase<_ctype> { \
NoneOp(){}; \
NoneOp(float, float){}; \
using NoneOpBase::NoneOpBase; \
using NoneOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
_simd_type2 operator()(const _simd_type2& src) const { return src; } \
void operator()(const _simd_type2& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, src.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src); \
} \
_simd_type operator()(const _simd_type& src) const { return src; } \
//! NoneOp: identity elementwise op. Specializes NoneOp<_ctype> to pass SIMD
//! vectors through unchanged; the store overloads write one vector (or both
//! halves of a V2 pair) to `dst`. Sub-vector access goes through
//! GiGetSubVector##_func_suffix##V2 rather than `.val[i]` so the macro also
//! works when the GI V2 types are opaque class types (not plain structs).
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width)            \
    template <>                                                                   \
    struct NoneOp<_ctype> : NoneOpBase<_ctype> {                                  \
        NoneOp(){};                                                               \
        NoneOp(float, float){};                                                   \
        using NoneOpBase::NoneOpBase;                                             \
        using NoneOpBase::operator();                                             \
        constexpr static size_t SIMD_WIDTH = _simd_width;                         \
        _simd_type2 operator()(const _simd_type2& src) const { return src; }      \
        void operator()(const _simd_type2& src, _ctype* dst) const {              \
            GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(src, 0)); \
            GiStore##_func_suffix(                                                \
                    dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(src, 1));  \
        }                                                                         \
        void operator()(const _simd_type& src, _ctype* dst) const {               \
            GiStore##_func_suffix(dst, src);                                      \
        }                                                                         \
        _simd_type operator()(const _simd_type& src) const { return src; }        \
    };

OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
@@ -61,8 +62,8 @@ struct NoneOp<dt_qint32, dt_qint8> : NoneOpBase<dt_qint32, dt_qint8> {
constexpr static size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t);

void operator()(const GI_INT32_V2_t& vsrc, dt_qint8* dst) const {
GiStoreInt32(dst, vsrc.val[0]);
GiStoreInt32(dst + 16, vsrc.val[1]);
GiStoreInt32(dst, GiGetSubVectorInt32V2(vsrc, 0));
GiStoreInt32(dst + 16, GiGetSubVectorInt32V2(vsrc, 1));
}
void operator()(const GI_INT32_t& src, dt_qint8* dst) const {
GiStoreInt32(dst, src);


+ 206
- 142
dnn/src/fallback/elemwise_helper/kimpl/op_base.h View File

@@ -31,24 +31,24 @@ struct UnaryOpBase : OpBase<src_ctype, dst_ctype> {
UnaryOpBase(DType /*src_dtype*/, DType /*dst_dtype*/) {}
};

#define OPERATOR_UNARY_QINT8_FALLBACK \
GI_INT16_t vsrct0 = GiMoveLowLongInt8(vsrc.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst), operator()( \
{{GiMoveLowLongInt16(vsrct0), \
GiMoveHighLongInt16(vsrct0)}})); \
GI_INT16_t vsrct1 = GiMoveHighLongInt8(vsrc.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 8), \
operator()({{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}})); \
GI_INT16_t vsrct2 = GiMoveLowLongInt8(vsrc.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 16), \
operator()({{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \
GI_INT16_t vsrct3 = GiMoveHighLongInt8(vsrc.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 24), \
operator()({{GiMoveLowLongInt16(vsrct3), GiMoveHighLongInt16(vsrct3)}}))
//! Body helper for quantized-int8 unary ops: widens the two int8 halves of
//! `vsrc` (GI_INT8_V2_t) to four GI_INT32_V2_t groups via int16, applies the
//! enclosing class's quantized operator() on each group, and stores 4x8 int8
//! results at dst, dst+8, dst+16, dst+24. `tmp` is filled with
//! GiSetSubVectorInt32V2 instead of brace-init so the macro also works when
//! the V2 types are opaque class types. Expects `vsrc` and `dst` in scope;
//! the caller supplies the terminating `;`.
#define OPERATOR_UNARY_QINT8_FALLBACK                                  \
    GI_INT16_t vsrct0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc, 0)); \
    GI_INT32_V2_t tmp;                                                 \
    GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct0));         \
    GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct0));        \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst), operator()(tmp));   \
    GI_INT16_t vsrct1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc, 0)); \
    GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct1));         \
    GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct1));        \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 8), operator()(tmp)); \
    GI_INT16_t vsrct2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc, 1)); \
    GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct2));         \
    GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct2));        \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 16), operator()(tmp)); \
    GI_INT16_t vsrct3 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc, 1)); \
    GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct3));         \
    GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct3));        \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 24), operator()(tmp))

//! scale_src = src.scale; scale_dst = 1.f / dst.scale (div -> mul)
//! scale = src.scale / dst.scale
@@ -56,17 +56,17 @@ template <>
struct UnaryOpBase<dt_qint8, dt_qint8> : OpBase<dt_qint8, dt_qint8> {
using OpBase::OpBase;
float scale_src, scale_dst;
GI_FLOAT32_t vscale_src, vscale_dst;
GI_FLOAT32_FIXLEN_t vscale_src, vscale_dst;
float scale;
GI_FLOAT32_t vscale;
GI_FLOAT32_FIXLEN_t vscale;

void init(float src_scale, float dst_scale) {
scale_src = src_scale;
vscale_src = GiBroadcastFloat32(scale_src);
vscale_src = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src));
scale_dst = 1.f / dst_scale;
vscale_dst = GiBroadcastFloat32(scale_dst);
vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst));
scale = src_scale / dst_scale;
vscale = GiBroadcastFloat32(scale);
vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}

UnaryOpBase(DType src_dtype, DType dst_dtype) {
@@ -83,17 +83,17 @@ struct UnaryOpBase<dt_qint32, dt_qint8> : OpBase<dt_qint32, dt_qint8> {
using src_ctype = dt_qint32;
using dst_ctype = dt_qint8;
float scale;
GI_FLOAT32_t vscale;
GI_FLOAT32_FIXLEN_t vscale;
float scale_src, scale_dst;
GI_FLOAT32_t vscale_src, vscale_dst;
GI_FLOAT32_FIXLEN_t vscale_src, vscale_dst;

void init(float src_scale, float dst_scale) {
scale_src = src_scale;
vscale_src = GiBroadcastFloat32(src_scale);
vscale_src = GiFloat32Type2FixLenType(GiBroadcastFloat32(src_scale));
scale_dst = 1 / dst_scale;
vscale_dst = GiBroadcastFloat32(scale_dst);
vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst));
scale = src_scale / dst_scale;
vscale = GiBroadcastFloat32(scale);
vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}

UnaryOpBase(DType src_dtype, DType dst_dtype) {
@@ -115,35 +115,36 @@ struct BinaryOpBase : OpBase<src_ctype, dst_ctype> {

/* ================= binary op for quantized types ================== */

#define OPERATOR_BINARY_QINT8_FALLBACK \
GI_INT16_t vsrct0_0 = GiMoveLowLongInt8(vsrc0.val[0]); \
GI_INT16_t vsrct1_0 = GiMoveLowLongInt8(vsrc1.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst), \
operator()( \
{{GiMoveLowLongInt16(vsrct0_0), GiMoveHighLongInt16(vsrct0_0)}}, \
{{GiMoveLowLongInt16(vsrct1_0), GiMoveHighLongInt16(vsrct1_0)}})); \
GI_INT16_t vsrct0_1 = GiMoveHighLongInt8(vsrc0.val[0]); \
GI_INT16_t vsrct1_1 = GiMoveHighLongInt8(vsrc1.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 8), \
operator()( \
{{GiMoveLowLongInt16(vsrct0_1), GiMoveHighLongInt16(vsrct0_1)}}, \
{{GiMoveLowLongInt16(vsrct1_1), GiMoveHighLongInt16(vsrct1_1)}})); \
GI_INT16_t vsrct0_2 = GiMoveLowLongInt8(vsrc0.val[1]); \
GI_INT16_t vsrct1_2 = GiMoveLowLongInt8(vsrc1.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 16), \
operator()( \
{{GiMoveLowLongInt16(vsrct0_2), GiMoveHighLongInt16(vsrct0_2)}}, \
{{GiMoveLowLongInt16(vsrct1_2), GiMoveHighLongInt16(vsrct1_2)}})); \
GI_INT16_t vsrct0_3 = GiMoveHighLongInt8(vsrc0.val[1]); \
GI_INT16_t vsrct1_3 = GiMoveHighLongInt8(vsrc1.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 24), \
operator()( \
{{GiMoveLowLongInt16(vsrct0_3), GiMoveHighLongInt16(vsrct0_3)}}, \
{{GiMoveLowLongInt16(vsrct1_3), GiMoveHighLongInt16(vsrct1_3)}}))
//! Body helper for quantized-int8 binary ops: widens each int8 half of
//! `vsrc0`/`vsrc1` (GI_INT8_V2_t) to GI_INT32_V2_t pairs via int16, applies
//! the enclosing class's quantized operator()(tmp0, tmp1) on each pair, and
//! stores 4x8 int8 results at dst, dst+8, dst+16, dst+24. Sub-vector access
//! uses GiGet/SetSubVectorInt32V2 instead of `.val[i]`/brace-init so the
//! macro also works when the V2 types are opaque class types.
//! NOTE: no trailing `;` — the caller supplies it, matching
//! OPERATOR_UNARY_QINT8_FALLBACK (a trailing `;` here would expand to `;;`).
#define OPERATOR_BINARY_QINT8_FALLBACK                                           \
    GI_INT16_t vsrct0_0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 0));     \
    GI_INT16_t vsrct1_0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 0));     \
    GI_INT32_V2_t tmp0, tmp1;                                                    \
    GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_0));                \
    GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_0));               \
    GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_0));                \
    GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_0));               \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst), operator()(tmp0, tmp1));      \
    GI_INT16_t vsrct0_1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 0));    \
    GI_INT16_t vsrct1_1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 0));    \
    GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_1));                \
    GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_1));               \
    GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_1));                \
    GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_1));               \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 8), operator()(tmp0, tmp1));  \
    GI_INT16_t vsrct0_2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 1));     \
    GI_INT16_t vsrct1_2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 1));     \
    GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_2));                \
    GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_2));               \
    GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_2));                \
    GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_2));               \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 16), operator()(tmp0, tmp1)); \
    GI_INT16_t vsrct0_3 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 1));    \
    GI_INT16_t vsrct1_3 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 1));    \
    GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_3));                \
    GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_3));               \
    GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_3));                \
    GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_3));               \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 24), operator()(tmp0, tmp1))

//! scale_src0 = src0.scale; scale_src1 = src1.scale; scale_dst = 1.f /
//! dst.scale scale0 = src0.scale / dst.scale; scale1 = src1.scale / dst.scale
@@ -153,21 +154,21 @@ struct BinaryOpBase<dt_qint8, dt_qint8> : OpBase<dt_qint8, dt_qint8> {
using src_ctype = dt_qint8;
using dst_ctype = dt_qint8;
float scale_src0, scale_src1, scale_dst;
GI_FLOAT32_t vscale_src0, vscale_src1, vscale_dst;
GI_FLOAT32_FIXLEN_t vscale_src0, vscale_src1, vscale_dst;
float scale0, scale1;
GI_FLOAT32_t vscale0, vscale1;
GI_FLOAT32_FIXLEN_t vscale0, vscale1;

void init(float src0_scale, float src1_scale, float dst_scale) {
scale_src0 = src0_scale;
vscale_src0 = GiBroadcastFloat32(scale_src0);
vscale_src0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src0));
scale_src1 = src1_scale;
vscale_src1 = GiBroadcastFloat32(scale_src1);
vscale_src1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src1));
scale_dst = 1.f / dst_scale;
vscale_dst = GiBroadcastFloat32(scale_dst);
vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst));
scale0 = src0_scale / dst_scale;
vscale0 = GiBroadcastFloat32(scale0);
vscale0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale0));
scale1 = src1_scale / dst_scale;
vscale1 = GiBroadcastFloat32(scale1);
vscale1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale1));
}

BinaryOpBase(DType src0_dtype, DType src1_dtype, DType dst_dtype) {
@@ -188,21 +189,21 @@ struct BinaryOpBase<dt_qint32, dt_qint8> : OpBase<dt_qint32, dt_qint8> {
using src_ctype = dt_qint32;
using dst_ctype = dt_qint8;
float scale0, scale1;
GI_FLOAT32_t vscale0, vscale1;
GI_FLOAT32_FIXLEN_t vscale0, vscale1;
float scale_src0, scale_src1, scale_dst;
GI_FLOAT32_t vscale_src0, vscale_src1, vscale_dst;
GI_FLOAT32_FIXLEN_t vscale_src0, vscale_src1, vscale_dst;

void init(float src0_scale, float src1_scale, float dst_scale) {
scale_src0 = src0_scale;
vscale_src0 = GiBroadcastFloat32(src0_scale);
vscale_src0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(src0_scale));
scale_src1 = src1_scale;
vscale_src1 = GiBroadcastFloat32(src1_scale);
vscale_src1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(src1_scale));
scale_dst = 1 / dst_scale;
vscale_dst = GiBroadcastFloat32(scale_dst);
vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst));
scale0 = src0_scale / dst_scale;
vscale0 = GiBroadcastFloat32(scale0);
vscale0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale0));
scale1 = src1_scale / dst_scale;
vscale1 = GiBroadcastFloat32(scale1);
vscale1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale1));
}

BinaryOpBase(DType src0_dtype, DType src1_dtype, DType dst_dtype) {
@@ -227,43 +228,48 @@ struct TernaryOpBase : OpBase<src_ctype, dst_ctype> {
DType /*dst_dtype*/) {}
};

#define OPERATOR_TERNARY_QINT8_FALLBACK \
GI_INT16_t vsrct0 = GiMoveLowLongInt8(vsrc0.val[0]); \
GI_INT16_t vsrct1 = GiMoveLowLongInt8(vsrc1.val[0]); \
GI_INT16_t vsrct2 = GiMoveLowLongInt8(vsrc2.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst), \
operator()( \
{{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \
{{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \
{{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \
vsrct0 = GiMoveHighLongInt8(vsrc0.val[0]); \
vsrct1 = GiMoveHighLongInt8(vsrc1.val[0]); \
vsrct2 = GiMoveHighLongInt8(vsrc2.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 8), \
operator()( \
{{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \
{{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \
{{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \
vsrct0 = GiMoveLowLongInt8(vsrc0.val[1]); \
vsrct1 = GiMoveLowLongInt8(vsrc1.val[1]); \
vsrct2 = GiMoveLowLongInt8(vsrc2.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 16), \
operator()( \
{{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \
{{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \
{{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \
vsrct0 = GiMoveHighLongInt8(vsrc0.val[1]); \
vsrct1 = GiMoveHighLongInt8(vsrc1.val[1]); \
vsrct2 = GiMoveHighLongInt8(vsrc2.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 24), \
operator()( \
{{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \
{{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \
{{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}}))
//! Body helper for quantized-int8 ternary ops: widens each int8 half of
//! `vsrc0`/`vsrc1`/`vsrc2` (GI_INT8_V2_t) to GI_INT32_V2_t triples via int16,
//! applies the enclosing class's quantized operator()(tmp0, tmp1, tmp2) on
//! each triple, and stores 4x8 int8 results at dst, dst+8, dst+16, dst+24.
//! Sub-vector access uses GiGet/SetSubVectorInt32V2 so the macro also works
//! when the V2 types are opaque class types.
//! NOTE: no trailing `;` — the caller supplies it, matching
//! OPERATOR_UNARY_QINT8_FALLBACK (a trailing `;` here would expand to `;;`).
#define OPERATOR_TERNARY_QINT8_FALLBACK                                                \
    GI_INT16_t vsrct0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 0));             \
    GI_INT16_t vsrct1 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 0));             \
    GI_INT16_t vsrct2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc2, 0));             \
    GI_INT32_V2_t tmp0, tmp1, tmp2;                                                    \
    GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0));                        \
    GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0));                       \
    GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1));                        \
    GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1));                       \
    GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2));                        \
    GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2));                       \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst), operator()(tmp0, tmp1, tmp2));      \
    vsrct0 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 0));                       \
    vsrct1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 0));                       \
    vsrct2 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc2, 0));                       \
    GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0));                        \
    GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0));                       \
    GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1));                        \
    GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1));                       \
    GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2));                        \
    GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2));                       \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 8), operator()(tmp0, tmp1, tmp2));  \
    vsrct0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 1));                        \
    vsrct1 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 1));                        \
    vsrct2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc2, 1));                        \
    GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0));                        \
    GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0));                       \
    GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1));                        \
    GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1));                       \
    GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2));                        \
    GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2));                       \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 16), operator()(tmp0, tmp1, tmp2)); \
    vsrct0 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 1));                       \
    vsrct1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 1));                       \
    vsrct2 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc2, 1));                       \
    GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0));                        \
    GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0));                       \
    GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1));                        \
    GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1));                       \
    GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2));                        \
    GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2));                       \
    GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 24), operator()(tmp0, tmp1, tmp2))

/*========================= ternaty op for quanzited ====================*/
template <>
@@ -272,24 +278,24 @@ struct TernaryOpBase<dt_qint8, dt_qint8> : OpBase<dt_qint8, dt_qint8> {
using src_ctype = dt_qint8;
using dst_ctype = dt_qint8;
float scale_src0, scale_src1, scale_src2, scale_dst;
GI_FLOAT32_t vscale_src0, vscale_src1, vscale_src2, vscale_dst;
GI_FLOAT32_FIXLEN_t vscale_src0, vscale_src1, vscale_src2, vscale_dst;
float scale0, scale1, scale2;
GI_FLOAT32_t vscale0, vscale1, vscale2;
GI_FLOAT32_FIXLEN_t vscale0, vscale1, vscale2;
void init(float src0_scale, float src1_scale, float src2_scale, float dst_scale) {
scale_src0 = src0_scale;
scale_src1 = src1_scale;
scale_src2 = src2_scale;
scale_dst = 1.f / dst_scale;
vscale_src0 = GiBroadcastFloat32(scale_src0);
vscale_src1 = GiBroadcastFloat32(scale_src1);
vscale_src2 = GiBroadcastFloat32(scale_src2);
vscale_dst = GiBroadcastFloat32(scale_dst);
vscale_src0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src0));
vscale_src1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src1));
vscale_src2 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src2));
vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst));
scale0 = src0_scale / dst_scale;
scale1 = src1_scale / dst_scale;
scale2 = src2_scale / dst_scale;
vscale0 = GiBroadcastFloat32(scale0);
vscale1 = GiBroadcastFloat32(scale1);
vscale2 = GiBroadcastFloat32(scale2);
vscale0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale0));
vscale1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale1));
vscale2 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale2));
}
TernaryOpBase(
DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype) {
@@ -307,7 +313,7 @@ struct TernaryOpBase<dt_qint8, dt_qint8> : OpBase<dt_qint8, dt_qint8> {

////////////////////////// fixup //////////////////////////
struct FixupBase {
GI_INT32_t vmultiplier, vshift;
GI_INT32_FIXLEN_t vmultiplier, vshift;
FixupBase(float scale) {
//! ignore Fixup if scale >= 0.5, using typecvt instead of shift &
//! multiplier, as it may introduce errors.
@@ -317,9 +323,9 @@ struct FixupBase {
int shift = static_cast<int>(::ceilf(::log2f(0.5 / scale)));
scale *= ::powf(2, shift);
//! Using double can get full precision here, but it can be ignored.
vmultiplier = GiBroadcastInt32(
std::round(static_cast<double>(scale) * ((2LL) << 30)));
vshift = GiBroadcastInt32(-shift);
vmultiplier = GiInt32Type2FixLenType(GiBroadcastInt32(
std::round(static_cast<double>(scale) * ((2LL) << 30))));
vshift = GiInt32Type2FixLenType(GiBroadcastInt32(-shift));
}
};

@@ -349,11 +355,25 @@ struct UnaryQuantizationOp<dt_qint8, dt_qint8, Op> : UnaryOpBase<dt_qint8, dt_qi
}

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale_src);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale_src);
auto val = this->op({{vitem0, vitem1}});
val.val[0] = GiMultiplyFloat32(val.val[0], this->vscale_dst);
val.val[1] = GiMultiplyFloat32(val.val[1], this->vscale_dst);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src));
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);

auto val = this->op(tmp);
GI_FLOAT32_t a = GiMultiplyFloat32(
GiGetSubVectorFloat32V2(val, 0),
GiFixLenType2GiFloat32Type(this->vscale_dst));
GI_FLOAT32_t b = GiMultiplyFloat32(
GiGetSubVectorFloat32V2(val, 1),
GiFixLenType2GiFloat32Type(this->vscale_dst));
GiSetSubVectorFloat32V2(val, 0, a);
GiSetSubVectorFloat32V2(val, 1, b);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(val);
}
};
@@ -385,13 +405,32 @@ struct BinaryQuantizationOp<dt_qint8, dt_qint8, Op> : BinaryOpBase<dt_qint8, dt_
}

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto val0 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0);
auto val1 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0);
auto val2 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1);
auto val3 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1);
auto val = op({{val0, val1}}, {{val2, val3}});
val.val[0] = GiMultiplyFloat32(val.val[0], this->vscale_dst);
val.val[1] = GiMultiplyFloat32(val.val[1], this->vscale_dst);
auto val0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src0));
auto val1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src0));
auto val2 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src1));
auto val3 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src1));
GI_FLOAT32_V2_t tmp0, tmp1;
GiSetSubVectorFloat32V2(tmp0, 0, val0);
GiSetSubVectorFloat32V2(tmp0, 1, val1);
GiSetSubVectorFloat32V2(tmp1, 0, val2);
GiSetSubVectorFloat32V2(tmp1, 1, val3);
auto val = op(tmp0, tmp1);
GI_FLOAT32_t a = GiMultiplyFloat32(
GiGetSubVectorFloat32V2(val, 0),
GiFixLenType2GiFloat32Type(this->vscale_dst));
GI_FLOAT32_t b = GiMultiplyFloat32(
GiGetSubVectorFloat32V2(val, 1),
GiFixLenType2GiFloat32Type(this->vscale_dst));
GiSetSubVectorFloat32V2(val, 0, a);
GiSetSubVectorFloat32V2(val, 1, b);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(val);
}
};
@@ -431,15 +470,40 @@ struct TernaryQuantizationOp<dt_qint8, dt_qint8, Op>
GI_INT8_t operator()(
const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1,
const GI_INT32_V2_t& vsrc2) const {
auto val0 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0);
auto val1 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0);
auto val2 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1);
auto val3 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1);
auto val4 = GiMultiplyFloat32(GiCastToFloat32(vsrc2.val[0]), this->vscale_src2);
auto val5 = GiMultiplyFloat32(GiCastToFloat32(vsrc2.val[1]), this->vscale_src2);
auto val = op({{val0, val1}}, {{val2, val3}}, {{val4, val5}});
val.val[0] = GiMultiplyFloat32(val.val[0], this->vscale_dst);
val.val[1] = GiMultiplyFloat32(val.val[1], this->vscale_dst);
auto val0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src0));
auto val1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src0));
auto val2 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src1));
auto val3 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src1));
auto val4 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc2, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src2));
auto val5 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc2, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src2));
GI_FLOAT32_V2_t tmp0, tmp1, tmp2;
GiSetSubVectorFloat32V2(tmp0, 0, val0);
GiSetSubVectorFloat32V2(tmp0, 1, val1);
GiSetSubVectorFloat32V2(tmp1, 0, val2);
GiSetSubVectorFloat32V2(tmp1, 1, val3);
GiSetSubVectorFloat32V2(tmp2, 0, val4);
GiSetSubVectorFloat32V2(tmp2, 1, val5);
auto val = op(tmp0, tmp1, tmp2);
GI_FLOAT32_t a = GiMultiplyFloat32(
GiGetSubVectorFloat32V2(val, 0),
GiFixLenType2GiFloat32Type(this->vscale_dst));
GI_FLOAT32_t b = GiMultiplyFloat32(
GiGetSubVectorFloat32V2(val, 1),
GiFixLenType2GiFloat32Type(this->vscale_dst));
GiSetSubVectorFloat32V2(val, 0, a);
GiSetSubVectorFloat32V2(val, 1, b);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(val);
}
};


+ 67
- 39
dnn/src/fallback/elemwise_helper/kimpl/relu.h View File

@@ -20,37 +20,43 @@ struct ReluOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_type = src_ctype>
struct ReluOp;

#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero) \
template <> \
struct ReluOp<_ctype> : ReluOpBase<_ctype> { \
using ReluOpBase::ReluOpBase; \
using ReluOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto vitem0 = GiMaximum##_func_suffix(src.val[0], zero); \
auto vitem1 = GiMaximum##_func_suffix(src.val[1], zero); \
return {{vitem0, vitem1}}; \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type operator()(const _simd_type& src) const { \
return GiMaximum##_func_suffix(src, zero); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero_num) \
template <> \
struct ReluOp<_ctype> : ReluOpBase<_ctype> { \
using ReluOpBase::ReluOpBase; \
using ReluOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
_simd_type zero = GiBroadcast##_func_suffix(zero_num); \
auto vitem0 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src, 0), zero); \
auto vitem1 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src, 1), zero); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type operator()(const _simd_type& src) const { \
_simd_type zero = GiBroadcast##_func_suffix(zero_num); \
return GiMaximum##_func_suffix(src, zero); \
} \
};

OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float),
vfzero)
OP(dt_int32, GI_INT32_t, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t),
vzero)
OP(dt_int8, GI_INT8_t, GI_INT8_V2_t, Int8, GI_SIMD_LEN_BYTE / sizeof(int8_t),
vzero_int8)
0.0f)
OP(dt_int32, GI_INT32_t, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t), 0)
OP(dt_int8, GI_INT8_t, GI_INT8_V2_t, Int8, GI_SIMD_LEN_BYTE / sizeof(int8_t), 0)
#undef OP

template <>
@@ -76,11 +82,19 @@ struct ReluOp<dt_qint8, dt_qint8> : ReluOpBase<dt_qint8, dt_qint8> {
OPERATOR_UNARY_QINT8_FALLBACK;
}
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale);
GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale));
vitem0 = GiMaximumFloat32(vitem0, vfzero);
vitem1 = GiMaximumFloat32(vitem1, vfzero);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};

@@ -104,6 +118,8 @@ template <>
struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8>, FixupBase {
using ReluOpBase::operator();
constexpr static size_t SIMD_WIDTH = 4;
GI_INT32_t vzero = GiBroadcastInt32(0);
GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);

ReluOp(DType src_dtype, DType dst_dtype)
: ReluOpBase(src_dtype, dst_dtype), FixupBase(scale) {}
@@ -115,8 +131,8 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8>, FixupBase
vst1_s8(reinterpret_cast<int8_t*>(dst), vget_low_s8(operator()(vsrc)));
}
int8x16_t operator()(const int32x4x2_t& vsrc) const {
int32x4_t vitem0 = vqrdmulhq_s32(vsrc.val[0], vmultiplier);
int32x4_t vitem1 = vqrdmulhq_s32(vsrc.val[1], vmultiplier);
int32x4_t vitem0 = vqrdmulhq_s32(GiGetSubVectorInt32V2(vsrc, 0), vmultiplier);
int32x4_t vitem1 = vqrdmulhq_s32(GiGetSubVectorInt32V2(vsrc, 1), vmultiplier);
vitem0 = vmaxq_s32(vitem0, vzero);
vitem1 = vmaxq_s32(vitem1, vzero);
auto tmp = vqmovn_s16(vcombine_s16(
@@ -158,24 +174,36 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8> {
}
void operator()(const GI_INT32_t& src, dt_qint8* dst) const {
GiStoreLane0Int32(
reinterpret_cast<int32_t*>(dst), (GI_INT32_t)(operator()(src)));
reinterpret_cast<int32_t*>(dst),
GiReinterpretInt8AsInt32(operator()(src)));
}

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale));
GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
vitem0 = GiMaximumFloat32(vitem0, vfzero);
vitem1 = GiMaximumFloat32(vitem1, vfzero);

return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
GI_INT8_t operator()(const GI_INT32_t& src) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(src), this->vscale);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(src), GiFixLenType2GiFloat32Type(this->vscale));
GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
vitem0 = GiMaximumFloat32(vitem0, vfzero);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0);
}
GI_INT8_t operator()(const GI_FLOAT32_t& src) const {
auto vitem0 = GiMultiplyFloat32(src, this->vscale);
auto vitem0 = GiMultiplyFloat32(src, GiFixLenType2GiFloat32Type(this->vscale));
GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
vitem0 = GiMaximumFloat32(vitem0, vfzero);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0);
}


+ 27
- 21
dnn/src/fallback/elemwise_helper/kimpl/sigmoid.h View File

@@ -25,27 +25,33 @@ struct SigmoidOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct SigmoidOp;

#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \
using SigmoidOpBase::SigmoidOpBase; \
using SigmoidOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
return {{operator()(src.val[0]), operator()(src.val[1])}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
return GiSigmoidPs##_func_suffix(src); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \
using SigmoidOpBase::SigmoidOpBase; \
using SigmoidOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2( \
ret, 0, operator()(GiGetSubVector##_func_suffix##V2(src, 0))); \
GiSetSubVector##_func_suffix##V2( \
ret, 1, operator()(GiGetSubVector##_func_suffix##V2(src, 1))); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
return GiSigmoidPs##_func_suffix(src); \
} \
};
OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
#undef OP


+ 29
- 10
dnn/src/fallback/elemwise_helper/kimpl/sub.h View File

@@ -33,14 +33,22 @@ struct SubOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiSubtract##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiSubtract##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiSubtract##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiSubtract##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -82,12 +90,23 @@ struct SubOp<dt_qint8, dt_qint8> : SubOpBase<dt_qint8, dt_qint8> {
}
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiSubtractFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiSubtractFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};



+ 52
- 48
dnn/src/fallback/elemwise_helper/kimpl/tanh.h View File

@@ -23,54 +23,58 @@ struct TanhOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_type = src_ctype>
struct TanhOp;

#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct TanhOp<_ctype> : TanhOpBase<_ctype> { \
using TanhOpBase::TanhOpBase; \
using TanhOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = src.val[0]; \
auto val2 = src.val[1]; \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val2 = GiMultiply##_func_suffix(two_val, val2); \
val1 = GiExpPs##_func_suffix(val1); \
val2 = GiExpPs##_func_suffix(val2); \
val1 = GiAdd##_func_suffix(one_val, val1); \
val2 = GiAdd##_func_suffix(one_val, val2); \
auto rval1 = GiRecpe##_func_suffix(val1); \
auto rval2 = GiRecpe##_func_suffix(val2); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
rval2 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val2, rval2), rval2); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val2 = GiMultiply##_func_suffix(two_val, rval2); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
val2 = GiSubtract##_func_suffix(one_val, val2); \
return {{val1, val2}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = src; \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val1 = GiExpPs##_func_suffix(val1); \
val1 = GiAdd##_func_suffix(one_val, val1); \
auto rval1 = GiRecpe##_func_suffix(val1); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
return val1; \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct TanhOp<_ctype> : TanhOpBase<_ctype> { \
using TanhOpBase::TanhOpBase; \
using TanhOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val2 = GiMultiply##_func_suffix(two_val, val2); \
val1 = GiExpPs##_func_suffix(val1); \
val2 = GiExpPs##_func_suffix(val2); \
val1 = GiAdd##_func_suffix(one_val, val1); \
val2 = GiAdd##_func_suffix(one_val, val2); \
auto rval1 = GiRecpe##_func_suffix(val1); \
auto rval2 = GiRecpe##_func_suffix(val2); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
rval2 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val2, rval2), rval2); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val2 = GiMultiply##_func_suffix(two_val, rval2); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
val2 = GiSubtract##_func_suffix(one_val, val2); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = src; \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val1 = GiExpPs##_func_suffix(val1); \
val1 = GiAdd##_func_suffix(one_val, val1); \
auto rval1 = GiRecpe##_func_suffix(val1); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
return val1; \
} \
};
OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
#undef OP


+ 11
- 7
dnn/src/fallback/elemwise_helper/kimpl/true_div.h View File

@@ -36,18 +36,22 @@ struct TrueDivOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiDivide##_func_suffix(val1, val3); \
val2 = GiDivide##_func_suffix(val2, val4); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \


+ 16
- 7
dnn/src/fallback/elemwise_helper/kimpl/typecvt.h View File

@@ -21,7 +21,8 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> {
}
void operator()(const GI_INT32_t& vsrc, dt_qint8* dst) const {
GiStoreLane0Int32(
reinterpret_cast<int32_t*>(dst), (GI_INT32_t)(operator()(vsrc)));
reinterpret_cast<int32_t*>(dst),
GiReinterpretInt8AsInt32(operator()(vsrc)));
}
void operator()(const src_ctype& src, dst_ctype* dst) const {
*dst = operator()(src);
@@ -32,17 +33,25 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> {
}

GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale);

return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale));

GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
GI_INT8_t operator()(const GI_INT32_t& src) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(src), this->vscale);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(src), GiFixLenType2GiFloat32Type(this->vscale));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0);
}
GI_INT8_t operator()(const GI_FLOAT32_t& src) const {
auto vitem0 = GiMultiplyFloat32(src, this->vscale);
auto vitem0 = GiMultiplyFloat32(src, GiFixLenType2GiFloat32Type(this->vscale));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0);
}
};


+ 182
- 131
dnn/src/fallback/elemwise_helper/op_common.h View File

@@ -96,6 +96,82 @@ cb(dt_float32, float, GI_FLOAT32_t, Float32);
cb(dt_int32, int32_t, GI_INT32_t, Int32);
#undef cb

///////////////////////////////// ParamElemVistor v2///////////////////////////
template <typename ctype>
struct ParamElemVisitorV2;

//! visitor single elemwise, and dup to vector
template <typename ctype>
struct ParamElemVisitorDupV2;

template <typename ctype>
struct ParamElemVisitorBcast101x4V2;

#define cb(_ctype, _inner_ctype, _simd_type, _fun_suffix, _simd_type_v2) \
template <> \
struct ParamElemVisitorV2<_ctype> { \
_simd_type_v2 operator()(const _ctype* src, const _ctype* src_1) const { \
_simd_type_v2 ret; \
GiSetSubVector##_fun_suffix##V2(ret, 0, GiLoad##_fun_suffix(src)); \
GiSetSubVector##_fun_suffix##V2(ret, 1, GiLoad##_fun_suffix(src_1)); \
return ret; \
} \
}; \
template <> \
struct ParamElemVisitorDupV2<_ctype> { \
_simd_type_v2 operator()(const _ctype* src) const { \
_simd_type_v2 ret; \
_simd_type tmp = GiBroadcast##_fun_suffix( \
*reinterpret_cast<const _inner_ctype*>(src)); \
GiSetSubVector##_fun_suffix##V2(ret, 0, tmp); \
GiSetSubVector##_fun_suffix##V2(ret, 1, tmp); \
return ret; \
} \
}
cb(dt_qint32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t);
cb(dt_qint8, int8_t, GI_INT8_t, Int8, GI_INT8_V2_t);

cb(dt_float32, float, GI_FLOAT32_t, Float32, GI_FLOAT32_V2_t);
cb(dt_int32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t);
cb(dt_int8, int8_t, GI_INT8_t, Int8, GI_INT8_V2_t);
#undef cb

template <typename ctype>
struct ParamElemVisitorBcast101x4V2;
#define cb(_ctype, _inner_ctype, _simd_type, _fun_suffix, rel_suffix, _simd_type_v2) \
template <> \
struct ParamElemVisitorBcast101x4V2<_ctype> { \
_simd_type_v2 operator()(const _ctype* src) const { \
_simd_type_v2 ret; \
_simd_type tmp = \
GiReinter##rel_suffix##To##_fun_suffix(GiBroadcast##rel_suffix( \
*reinterpret_cast<const _inner_ctype*>(src))); \
GiSetSubVector##_fun_suffix##V2(ret, 0, tmp); \
GiSetSubVector##_fun_suffix##V2(ret, 1, tmp); \
return ret; \
} \
}

cb(dt_qint8, int32_t, GI_INT8_t, Int8, Int32, GI_INT8_V2_t);
cb(dt_int8, int32_t, GI_INT8_t, Int8, Int32, GI_INT8_V2_t);
#undef cb
#define cb(_ctype, _inner_ctype, _simd_type, _fun_suffix, _simd_type_v2) \
template <> \
struct ParamElemVisitorBcast101x4V2<_ctype> { \
_simd_type_v2 operator()(const _ctype* src) const { \
_simd_type_v2 ret; \
_simd_type tmp = GiLoad##_fun_suffix(src); \
GiSetSubVector##_fun_suffix##V2(ret, 0, tmp); \
GiSetSubVector##_fun_suffix##V2(ret, 1, tmp); \
return ret; \
} \
}

cb(dt_qint32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t);
cb(dt_float32, float, GI_FLOAT32_t, Float32, GI_FLOAT32_V2_t);
cb(dt_int32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t);
#undef cb

///////////////////////////////// OpCaller /////////////////////////////
template <typename Op, BcastType bcast_type>
struct OpCallerUnary;
@@ -106,10 +182,10 @@ struct OpCallerUnary<Op, VEC> {
const typename Op::src_ctype* src, typename Op::dst_ctype* dst,
DType src_dtype, DType dst_dtype, size_t nr_elems) {
Op op(src_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis;
ParamElemVisitorV2<typename Op::src_ctype> vis;
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) {
op({{vis(src), vis(src + Op::SIMD_WIDTH)}}, dst);
op(vis(src, src + Op::SIMD_WIDTH), dst);
src += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
}
@@ -364,12 +440,12 @@ struct OpCallerBinary<Op, VEC_VEC> {
typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype,
DType dst_dtype, size_t nr_elems) {
Op op(src0_dtype, src1_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis0;
ParamElemVisitor<typename Op::src_ctype> vis1;
ParamElemVisitorV2<typename Op::src_ctype> vis0;
ParamElemVisitorV2<typename Op::src_ctype> vis1;
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}},
{{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1(src1, src1 + Op::SIMD_WIDTH),
dst);
src0 += Op::SIMD_WIDTH * 2;
src1 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
@@ -394,17 +470,16 @@ struct OpCallerBinary<Op, VEC_BCAST101> {
typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype,
DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) {
Op op(src0_dtype, src1_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis0;
ParamElemVisitorDup<typename Op::src_ctype> vis1;
ParamElemVisitorV2<typename Op::src_ctype> vis0;
ParamElemVisitorDupV2<typename Op::src_ctype> vis1;
for (size_t b = 0; b < batch; b++) {
const typename Op::src_ctype* src1_ptr = src1;
for (size_t c = 0; c < channel; c++) {
size_t i = 0;
auto src1_simd = vis1(src1_ptr);
auto src1_simd_v2 = vis1(src1_ptr);
for (; i + Op::SIMD_WIDTH * 2 <= channel_stride;
i += Op::SIMD_WIDTH * 2) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}},
{{src1_simd, src1_simd}}, dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), src1_simd_v2, dst);
src0 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
}
@@ -430,7 +505,7 @@ struct OpCallerBinary<Op, VEC_BCASTX0X> {
typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype,
DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) {
Op op(src0_dtype, src1_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis;
ParamElemVisitorV2<typename Op::src_ctype> vis;
for (size_t b = 0; b < batch; b++) {
const typename Op::src_ctype* src1_ptr_base = src1 + b * channel_stride;
for (size_t c = 0; c < channel; c++) {
@@ -438,11 +513,9 @@ struct OpCallerBinary<Op, VEC_BCASTX0X> {
auto src1_ptr = src1_ptr_base;
for (; i + Op::SIMD_WIDTH * 2 <= channel_stride;
i += Op::SIMD_WIDTH * 2) {
auto src0_simd0 = vis(src0);
auto src0_simd1 = vis(src0 + Op::SIMD_WIDTH);
auto src1_simd0 = vis(src1_ptr);
auto src1_simd1 = vis(src1_ptr + Op::SIMD_WIDTH);
op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst);
auto src0_simd01 = vis(src0, src0 + Op::SIMD_WIDTH);
auto src1_simd01 = vis(src1_ptr, src1_ptr + Op::SIMD_WIDTH);
op(src0_simd01, src1_simd01, dst);
src0 += Op::SIMD_WIDTH * 2;
src1_ptr += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
@@ -469,19 +542,17 @@ struct OpCallerBinary<Op, VEC_BCAST111C> {
typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype,
DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) {
Op op(src0_dtype, src1_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis;
ParamElemVisitorV2<typename Op::src_ctype> vis;
for (size_t b = 0; b < batch; b++) {
for (size_t c = 0; c < channel; c++) {
size_t rest = channel_stride;
const typename Op::src_ctype* src1_ptr = src1;
while (rest >= Op::SIMD_WIDTH * 2) {
auto src0_simd0 = vis(src0);
auto src0_simd1 = vis(src0 + Op::SIMD_WIDTH);
auto src1_simd0 = vis(src1_ptr);
auto src1_simd1 = vis(src1_ptr + Op::SIMD_WIDTH);
auto src0_simd01 = vis(src0, src0 + Op::SIMD_WIDTH);
auto src1_simd01 = vis(src1_ptr, src1_ptr + Op::SIMD_WIDTH);
src0 += Op::SIMD_WIDTH * 2;
src1_ptr += Op::SIMD_WIDTH * 2;
op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst);
op(src0_simd01, src1_simd01, dst);
dst += Op::SIMD_WIDTH * 2;
rest -= Op::SIMD_WIDTH * 2;
}
@@ -508,19 +579,17 @@ struct OpCallerBinary<Op, BCAST111C_VEC> {
typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype,
DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) {
Op op(src0_dtype, src1_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis;
ParamElemVisitorV2<typename Op::src_ctype> vis;
for (size_t b = 0; b < batch; b++) {
for (size_t c = 0; c < channel; c++) {
size_t rest = channel_stride;
const typename Op::src_ctype* src0_ptr = src0;
while (rest >= Op::SIMD_WIDTH * 2) {
auto src0_simd0 = vis(src0_ptr);
auto src0_simd1 = vis(src0_ptr + Op::SIMD_WIDTH);
auto src1_simd0 = vis(src1);
auto src1_simd1 = vis(src1 + Op::SIMD_WIDTH);
auto src0_simd01 = vis(src0_ptr, src0_ptr + Op::SIMD_WIDTH);
auto src1_simd01 = vis(src1, src1 + Op::SIMD_WIDTH);
src0_ptr += Op::SIMD_WIDTH * 2;
src1 += Op::SIMD_WIDTH * 2;
op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst);
op(src0_simd01, src1_simd01, dst);
dst += Op::SIMD_WIDTH * 2;
rest -= Op::SIMD_WIDTH * 2;
}
@@ -599,13 +668,12 @@ struct OpCallerBinaryBcast101xDVec {
auto src0_ptr = src0;
for (size_t cb = 0; cb < nr_channel_blocks; cb++) {
auto src0_block_ptr = src0_ptr + cb * channel_block_dim;
auto channel_block_vec = vis0(src0_block_ptr);
auto channel_block_vec_v2 = vis0(src0_block_ptr);
size_t img_index = 0;
auto src1_offset = Op::SIMD_WIDTH / channel_block_dim;
for (; img_index + 2 * src1_offset <= channel_stride;
img_index += 2 * src1_offset) {
op({{channel_block_vec, channel_block_vec}},
{{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, dst);
op(channel_block_vec_v2, vis1(src1, src1 + Op::SIMD_WIDTH), dst);
src1 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
}
@@ -629,8 +697,8 @@ struct OpCallerBinaryBcast101xXVec<src_ctype, 4> {
const src_ctype* src0, const src_ctype* src1, typename Op::dst_ctype* dst,
const Op& op, size_t batch, size_t nr_channel_blocks,
size_t channel_stride) {
ParamElemVisitorBcast101x4<typename Op::src_ctype> vis0;
ParamElemVisitor<typename Op::src_ctype> vis1;
ParamElemVisitorBcast101x4V2<typename Op::src_ctype> vis0;
ParamElemVisitorV2<typename Op::src_ctype> vis1;
OpCallerBinaryBcast101xDVec<src_ctype, 4>::run(
src0, src1, dst, op, vis0, vis1, batch, nr_channel_blocks,
channel_stride);
@@ -717,13 +785,12 @@ struct OpCallerBinaryVecBcast101xD {
auto src1_ptr = src1;
for (size_t cb = 0; cb < nr_channel_blocks; cb++) {
auto src1_block_ptr = src1_ptr + cb * channel_block_dim;
auto channel_block_vec = vis1(src1_block_ptr);
auto channel_block_vec_v2 = vis1(src1_block_ptr);
size_t img_index = 0;
auto src0_offset = Op::SIMD_WIDTH / channel_block_dim;
for (; img_index + 2 * src0_offset <= channel_stride;
img_index += 2 * src0_offset) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}},
{{channel_block_vec, channel_block_vec}}, dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), channel_block_vec_v2, dst);
src0 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
}
@@ -747,8 +814,8 @@ struct OpCallerBinaryVecBcast101xX<src_ctype, 4> {
const src_ctype* src0, const src_ctype* src1, typename Op::dst_ctype* dst,
const Op& op, size_t batch, size_t nr_channel_blocks,
size_t channel_stride) {
ParamElemVisitor<src_ctype> vis0;
ParamElemVisitorBcast101x4<src_ctype> vis1;
ParamElemVisitorV2<src_ctype> vis0;
ParamElemVisitorBcast101x4V2<src_ctype> vis1;
OpCallerBinaryVecBcast101xD<src_ctype, 4>::run(
src0, src1, dst, op, vis0, vis1, batch, nr_channel_blocks,
channel_stride);
@@ -783,13 +850,12 @@ struct OpCallerBinary<Op, VEC_SCALAR> {
typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype,
DType dst_dtype, size_t nr_elems) {
Op op(src0_dtype, src1_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis0;
ParamElemVisitorDup<typename Op::src_ctype> vis1;
auto vis1_simd = vis1(&src1);
ParamElemVisitorV2<typename Op::src_ctype> vis0;
ParamElemVisitorDupV2<typename Op::src_ctype> vis1;
auto vis1_simd_v2 = vis1(&src1);
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, {{vis1_simd, vis1_simd}},
dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1_simd_v2, dst);
src0 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
}
@@ -813,13 +879,12 @@ struct OpCallerBinary<Op, SCALAR_VEC> {
typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype,
DType dst_dtype, size_t nr_elems) {
Op op(src0_dtype, src1_dtype, dst_dtype);
ParamElemVisitorDup<typename Op::src_ctype> vis0;
ParamElemVisitor<typename Op::src_ctype> vis1;
auto vis0_simd = vis0(&src0);
ParamElemVisitorDupV2<typename Op::src_ctype> vis0;
ParamElemVisitorV2<typename Op::src_ctype> vis1;
auto vis0_simd_v2 = vis0(&src0);
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) {
op({{vis0_simd, vis0_simd}}, {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}},
dst);
op(vis0_simd_v2, vis1(src1, src1 + Op::SIMD_WIDTH), dst);
src1 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
}
@@ -842,17 +907,16 @@ struct OpCallerBinary<Op, BCAST101_VEC> {
typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype,
DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) {
Op op(src0_dtype, src1_dtype, dst_dtype);
ParamElemVisitorDup<typename Op::src_ctype> vis0;
ParamElemVisitor<typename Op::src_ctype> vis1;
ParamElemVisitorDupV2<typename Op::src_ctype> vis0;
ParamElemVisitorV2<typename Op::src_ctype> vis1;
for (size_t b = 0; b < batch; b++) {
auto src0_ptr = src0;
for (size_t c = 0; c < channel; c++) {
auto vis0_simd = vis0(src0_ptr);
auto vis0_simd_v2 = vis0(src0_ptr);
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= channel_stride;
i += Op::SIMD_WIDTH * 2) {
op({{vis0_simd, vis0_simd}},
{{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, dst);
op(vis0_simd_v2, vis1(src1, src1 + Op::SIMD_WIDTH), dst);
src1 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
}
@@ -878,7 +942,7 @@ struct OpCallerBinary<Op, BCASTX0X_VEC> {
typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype,
DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) {
Op op(src0_dtype, src1_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis;
ParamElemVisitorV2<typename Op::src_ctype> vis;
for (size_t b = 0; b < batch; b++) {
auto src0_ptr_base = src0 + b * channel_stride;
for (size_t c = 0; c < channel; c++) {
@@ -886,11 +950,9 @@ struct OpCallerBinary<Op, BCASTX0X_VEC> {
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= channel_stride;
i += Op::SIMD_WIDTH * 2) {
auto src0_simd0 = vis(src0_ptr);
auto src0_simd1 = vis(src0_ptr + Op::SIMD_WIDTH);
auto src1_simd0 = vis(src1);
auto src1_simd1 = vis(src1 + Op::SIMD_WIDTH);
op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst);
auto src0_simd01 = vis(src0_ptr, src0_ptr + Op::SIMD_WIDTH);
auto src1_simd01 = vis(src1, src1 + Op::SIMD_WIDTH);
op(src0_simd01, src1_simd01, dst);
src0_ptr += Op::SIMD_WIDTH * 2;
src1 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
@@ -921,14 +983,13 @@ struct OpCallerTernary<Op, VEC_VEC_VEC> {
DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype,
size_t nr_elems) {
Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis0;
ParamElemVisitor<typename Op::src_ctype> vis1;
ParamElemVisitor<typename Op::src_ctype> vis2;
ParamElemVisitorV2<typename Op::src_ctype> vis0;
ParamElemVisitorV2<typename Op::src_ctype> vis1;
ParamElemVisitorV2<typename Op::src_ctype> vis2;
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}},
{{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}},
{{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1(src1, src1 + Op::SIMD_WIDTH),
vis2(src2, src2 + Op::SIMD_WIDTH), dst);
src0 += Op::SIMD_WIDTH * 2;
src1 += Op::SIMD_WIDTH * 2;
src2 += Op::SIMD_WIDTH * 2;
@@ -957,15 +1018,14 @@ struct OpCallerTernary<Op, VEC_VEC_SCALAR> {
DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype,
size_t nr_elems) {
Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis0;
ParamElemVisitor<typename Op::src_ctype> vis1;
ParamElemVisitorDup<typename Op::src_ctype> vis2;
auto vis2_simd = vis2(&src2);
ParamElemVisitorV2<typename Op::src_ctype> vis0;
ParamElemVisitorV2<typename Op::src_ctype> vis1;
ParamElemVisitorDupV2<typename Op::src_ctype> vis2;
auto vis2_simd_v2 = vis2(&src2);
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}},
{{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, {{vis2_simd, vis2_simd}},
dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1(src1, src1 + Op::SIMD_WIDTH),
vis2_simd_v2, dst);
src0 += Op::SIMD_WIDTH * 2;
src1 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
@@ -993,22 +1053,21 @@ struct OpCallerTernary<Op, BCAST101_VEC_BCAST101> {
size_t batch_size, size_t channel_size, size_t channel_stride,
size_t batch_offset) {
Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis1;
ParamElemVisitorDup<typename Op::src_ctype> vis0;
ParamElemVisitorDup<typename Op::src_ctype> vis2;
ParamElemVisitorV2<typename Op::src_ctype> vis1;
ParamElemVisitorDupV2<typename Op::src_ctype> vis0;
ParamElemVisitorDupV2<typename Op::src_ctype> vis2;
for (size_t batch = 0; batch < batch_size; batch++) {
auto src0_ptr = src0;
auto src2_ptr = src2;
auto b_offset = batch_offset;
for (size_t channel = 0; channel < channel_size; channel++) {
size_t i = 0;
auto src0_simd = vis0(src0_ptr);
auto src2_simd = vis2(src2_ptr);
auto src0_simd_v2 = vis0(src0_ptr);
auto src2_simd_v2 = vis2(src2_ptr);
for (; i + Op::SIMD_WIDTH * 2 <= channel_stride;
i += Op::SIMD_WIDTH * 2) {
op({{src0_simd, src0_simd}},
{{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}},
{{src2_simd, src2_simd}}, dst);
op(src0_simd_v2, vis1(src1, src1 + Op::SIMD_WIDTH), src2_simd_v2,
dst);
src1 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
b_offset -= Op::SIMD_WIDTH * 2;
@@ -1042,7 +1101,7 @@ struct OpCallerTernary<Op, BCAST111C_VEC_BCAST111C> {
DType src2_dtype, DType dst_dtype, size_t batch_size, size_t channel_size,
size_t channel_stride, size_t batch_offset) {
Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis;
ParamElemVisitorV2<typename Op::src_ctype> vis;
for (size_t batch = 0; batch < batch_size; batch++) {
auto b_offset = batch_offset;
for (size_t channel = 0; channel < channel_size; channel++) {
@@ -1051,14 +1110,10 @@ struct OpCallerTernary<Op, BCAST111C_VEC_BCAST111C> {
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= channel_stride;
i += Op::SIMD_WIDTH * 2) {
auto src0_simd0 = vis(src0_ptr);
auto src0_simd1 = vis(src0_ptr + Op::SIMD_WIDTH);
auto src1_simd0 = vis(src1);
auto src1_simd1 = vis(src1 + Op::SIMD_WIDTH);
auto src2_simd0 = vis(src2_ptr);
auto src2_simd1 = vis(src2_ptr + Op::SIMD_WIDTH);
op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}},
{{src2_simd0, src2_simd1}}, dst);
auto src0_simd01 = vis(src0_ptr, src0_ptr + Op::SIMD_WIDTH);
auto src1_simd01 = vis(src1, src1 + Op::SIMD_WIDTH);
auto src2_simd01 = vis(src2_ptr, src2_ptr + Op::SIMD_WIDTH);
op(src0_simd01, src1_simd01, src2_simd01, dst);
src0_ptr += Op::SIMD_WIDTH * 2;
src1 += Op::SIMD_WIDTH * 2;
src2_ptr += Op::SIMD_WIDTH * 2;
@@ -1125,15 +1180,14 @@ struct OpCallerTernaryBcast101xDVecBcast101xD {
for (size_t cb = 0; cb < nr_channel_blocks; cb++) {
auto src0_block_ptr = src0_ptr + cb * channel_block_dim;
auto src2_block_ptr = src2_ptr + cb * channel_block_dim;
auto channel_block_vec0 = vis0(src0_block_ptr);
auto channel_block_vec2 = vis2(src2_block_ptr);
auto channel_block_vec0_v2 = vis0(src0_block_ptr);
auto channel_block_vec2_v2 = vis2(src2_block_ptr);
size_t img_index = 0;
auto src1_offset = Op::SIMD_WIDTH / channel_block_dim;
for (; img_index + 2 * src1_offset <= channel_stride;
img_index += 2 * src1_offset) {
op({{channel_block_vec0, channel_block_vec0}},
{{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}},
{{channel_block_vec2, channel_block_vec2}}, dst);
op(channel_block_vec0_v2, vis1(src1, src1 + Op::SIMD_WIDTH),
channel_block_vec2_v2, dst);
src1 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
}
@@ -1159,9 +1213,9 @@ struct OpCallerTernaryBcast101xXVecBcast101xX<src_ctype, 4> {
const src_ctype* src0, const src_ctype* src1, const src_ctype* src2,
typename Op::dst_ctype* dst, const Op& op, size_t batch,
size_t nr_channel_blocks, size_t channel_stride) {
ParamElemVisitorBcast101x4<src_ctype> vis0;
ParamElemVisitor<src_ctype> vis1;
ParamElemVisitorBcast101x4<src_ctype> vis2;
ParamElemVisitorBcast101x4V2<src_ctype> vis0;
ParamElemVisitorV2<src_ctype> vis1;
ParamElemVisitorBcast101x4V2<src_ctype> vis2;
OpCallerTernaryBcast101xDVecBcast101xD<src_ctype, 4>::run(
src0, src1, src2, dst, op, vis0, vis1, vis2, batch, nr_channel_blocks,
channel_stride);
@@ -1201,19 +1255,18 @@ struct OpCallerTernary<Op, VEC_BCAST101_VEC> {
DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype,
size_t batch_size, size_t channel_size, size_t channel_stride) {
Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis0;
ParamElemVisitorDup<typename Op::src_ctype> vis1;
ParamElemVisitor<typename Op::src_ctype> vis2;
ParamElemVisitorV2<typename Op::src_ctype> vis0;
ParamElemVisitorDupV2<typename Op::src_ctype> vis1;
ParamElemVisitorV2<typename Op::src_ctype> vis2;
for (size_t batch = 0; batch < batch_size; batch++) {
auto src1_ptr = src1;
for (size_t channel = 0; channel < channel_size; channel++) {
size_t i = 0;
auto src1_simd = vis1(src1_ptr);
auto src1_simd_v2 = vis1(src1_ptr);
for (; i + Op::SIMD_WIDTH * 2 <= channel_stride;
i += Op::SIMD_WIDTH * 2) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}},
{{src1_simd, src1_simd}},
{{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), src1_simd_v2,
vis2(src2, src2 + Op::SIMD_WIDTH), dst);
src0 += Op::SIMD_WIDTH * 2;
src2 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
@@ -1244,18 +1297,18 @@ struct OpCallerTernary<Op, VEC_BCAST111C_VEC> {
DType src1_dtype, DType src2_dtype, DType dst_dtype, size_t batch_size,
size_t channel_size, size_t channel_stride) {
Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis0;
ParamElemVisitor<typename Op::src_ctype> vis1;
ParamElemVisitor<typename Op::src_ctype> vis2;
ParamElemVisitorV2<typename Op::src_ctype> vis0;
ParamElemVisitorV2<typename Op::src_ctype> vis1;
ParamElemVisitorV2<typename Op::src_ctype> vis2;
for (size_t batch = 0; batch < batch_size; batch++) {
for (size_t channel = 0; channel < channel_size; channel++) {
auto src1_ptr = src1;
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= channel_stride;
i += Op::SIMD_WIDTH * 2) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}},
{{vis1(src1_ptr), vis1(src1_ptr + Op::SIMD_WIDTH)}},
{{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH),
vis1(src1_ptr, src1_ptr + Op::SIMD_WIDTH),
vis2(src2, src2 + Op::SIMD_WIDTH), dst);
src0 += Op::SIMD_WIDTH * 2;
src1_ptr += Op::SIMD_WIDTH * 2;
src2 += Op::SIMD_WIDTH * 2;
@@ -1316,14 +1369,13 @@ struct OpCallerTernaryVecBcast101xDVec {
auto src1_ptr = src1;
for (size_t cb = 0; cb < nr_channel_blocks; cb++) {
auto src1_block_ptr = src1_ptr + cb * channel_block_dim;
auto channel_block_vec = vis1(src1_block_ptr);
auto channel_block_vec_v2 = vis1(src1_block_ptr);
size_t img_index = 0;
auto offset = Op::SIMD_WIDTH / channel_block_dim;
for (; img_index + 2 * offset <= channel_stride;
img_index += 2 * offset) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}},
{{channel_block_vec, channel_block_vec}},
{{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), channel_block_vec_v2,
vis2(src2, src2 + Op::SIMD_WIDTH), dst);
src0 += Op::SIMD_WIDTH * 2;
src2 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
@@ -1349,9 +1401,9 @@ struct OpCallerTernaryVecBcast101xXVec<src_ctype, 4> {
const src_ctype* src0, const src_ctype* src1, const src_ctype* src2,
typename Op::dst_ctype* dst, const Op& op, size_t batch,
size_t nr_channel_blocks, size_t channel_stride) {
ParamElemVisitor<src_ctype> vis0;
ParamElemVisitorBcast101x4<src_ctype> vis1;
ParamElemVisitor<src_ctype> vis2;
ParamElemVisitorV2<src_ctype> vis0;
ParamElemVisitorBcast101x4V2<src_ctype> vis1;
ParamElemVisitorV2<src_ctype> vis2;
OpCallerTernaryVecBcast101xDVec<src_ctype, 4>::run(
src0, src1, src2, dst, op, vis0, vis1, vis2, batch, nr_channel_blocks,
channel_stride);
@@ -1392,14 +1444,14 @@ struct OpCallerTernary<Op, VEC_SCALAR_VEC> {
DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype,
size_t nr_elems) {
Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis0;
ParamElemVisitorDup<typename Op::src_ctype> vis1;
ParamElemVisitor<typename Op::src_ctype> vis2;
auto vis1_simd = vis1(&src1);
ParamElemVisitorV2<typename Op::src_ctype> vis0;
ParamElemVisitorDupV2<typename Op::src_ctype> vis1;
ParamElemVisitorV2<typename Op::src_ctype> vis2;
auto vis1_simd_v2 = vis1(&src1);
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, {{vis1_simd, vis1_simd}},
{{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1_simd_v2,
vis2(src2, src2 + Op::SIMD_WIDTH), dst);
src0 += Op::SIMD_WIDTH * 2;
src2 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
@@ -1426,15 +1478,14 @@ struct OpCallerTernary<Op, VEC_SCALAR_SCALAR> {
DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype,
size_t nr_elems) {
Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype);
ParamElemVisitor<typename Op::src_ctype> vis0;
ParamElemVisitorDup<typename Op::src_ctype> vis1;
ParamElemVisitorDup<typename Op::src_ctype> vis2;
auto vis1_simd = vis1(&src1);
auto vis2_simd = vis2(&src2);
ParamElemVisitorV2<typename Op::src_ctype> vis0;
ParamElemVisitorDupV2<typename Op::src_ctype> vis1;
ParamElemVisitorDupV2<typename Op::src_ctype> vis2;
auto vis1_simd_v2 = vis1(&src1);
auto vis2_simd_v2 = vis2(&src2);
size_t i = 0;
for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) {
op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, {{vis1_simd, vis1_simd}},
{{vis2_simd, vis2_simd}}, dst);
op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1_simd_v2, vis2_simd_v2, dst);
src0 += Op::SIMD_WIDTH * 2;
dst += Op::SIMD_WIDTH * 2;
}


+ 3
- 2
dnn/src/fallback/gi_intrinsic_helper.h View File

@@ -11,8 +11,9 @@ struct LoadHelper {
static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args);
};

#define WEIGHT_CB(step) \
src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...);
#define WEIGHT_CB(step) \
src[step] = GiFloat32Type2FixLenType( \
Func::impl(ptr + base_offset + step * ptr_step, args...));

#define LOAD_HELPER(step) \
template < \


+ 7
- 1
dnn/src/fallback/quantized_converter.h View File

@@ -38,7 +38,13 @@ template <>
inline GI_FLOAT32_V2_t QConverter::convert(const GI_INT16_t& vsrc) {
GI_INT32_t vhi = GiMoveHighLongInt16(vsrc);
GI_INT32_t vlo = GiMoveLowLongInt16(vsrc);
return {{GiCastToFloat32(vlo), GiCastToFloat32(vhi)}};
GI_FLOAT32_t fhi = GiCastToFloat32(vhi);
GI_FLOAT32_t flo = GiCastToFloat32(vlo);
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, flo);
GiSetSubVectorFloat32V2(ret, 1, fhi);

return ret;
}

template <>


+ 195
- 156
dnn/src/fallback/reduce/reducer.h View File

@@ -36,14 +36,14 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
using ctype = int8_t;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);

GI_INT32_t res[4];
GI_INT32_FIXLEN_t res[4];
int32_t remain;
int32_t cnt;
float coef;
GI_FLOAT32_t vcoef;
GI_FLOAT32_FIXLEN_t vcoef;
MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
memset(res, 0, sizeof(res));
vcoef = GiBroadcastFloat32(coef);
vcoef = GiFloat32Type2FixLenType(GiBroadcastFloat32(coef));
}
MeanReducer() = default;
void feed(const int8_t* val) {
@@ -56,19 +56,27 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high);
const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high);

res[0] = GiAddInt32(res[0], vval_low_low);
res[1] = GiAddInt32(res[1], vval_low_high);
res[2] = GiAddInt32(res[2], vval_high_low);
res[3] = GiAddInt32(res[3], vval_high_high);
res[0] = GiInt32Type2FixLenType(
GiAddInt32(GiFixLenType2GiInt32Type(res[0]), vval_low_low));
res[1] = GiInt32Type2FixLenType(
GiAddInt32(GiFixLenType2GiInt32Type(res[1]), vval_low_high));
res[2] = GiInt32Type2FixLenType(
GiAddInt32(GiFixLenType2GiInt32Type(res[2]), vval_high_low));
res[3] = GiInt32Type2FixLenType(
GiAddInt32(GiFixLenType2GiInt32Type(res[3]), vval_high_high));
}
void feed_remain(const int8_t* val) { remain += *val; }
void post(int8_t* dst) {
for (int i = 0; i < 4; i += 2) {
GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
GiStoreLowInt8(
dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(
{{vitem0, vitem1}})));
auto tmp = GiFixLenType2GiFloat32Type(vcoef);
GI_FLOAT32_t vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiFixLenType2GiInt32Type(res[i])), tmp);
GI_FLOAT32_t vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiFixLenType2GiInt32Type(res[i + 1])), tmp);
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
GiStoreLowInt8(dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret)));
dst += 8;
}
}
@@ -83,17 +91,20 @@ struct MeanReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

GI_FLOAT32_t res;
GI_FLOAT32_FIXLEN_t res;
float result;
float coef;
MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
res = GiBroadcastFloat32(0.0f);
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
}
MeanReducer() = default;
void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); }
void feed(const float* val) {
res = GiFloat32Type2FixLenType(
GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res)));
}
void feed_remain(const float* val) { result += *val; }
void post(float* dst) {
result += GiReduceAddFloat32(res);
result += GiReduceAddFloat32(GiFixLenType2GiFloat32Type(res));
*dst = result * coef;
}
};
@@ -103,18 +114,22 @@ struct MeanReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

GI_FLOAT32_t res;
GI_FLOAT32_FIXLEN_t res;
float remain;
float coef;
MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
res = GiBroadcastFloat32(0.0f);
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
}
MeanReducer() = default;
void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); }
void feed(const float* val) {
res = GiFloat32Type2FixLenType(
GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res)));
}
void feed_remain(const float* val) { remain += *val; }
void post(float* dst) {
res = GiMultiplyScalerFloat32(res, coef);
GiStoreFloat32(dst, res);
res = GiFloat32Type2FixLenType(
GiMultiplyScalerFloat32(GiFixLenType2GiFloat32Type(res), coef));
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res));
}
void post_remain(float* dst) { *dst = remain * coef; }
};
@@ -125,23 +140,29 @@ struct maxReducer;
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct minReducer;

#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_t res; \
_mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \
_mode##Reducer() = default; \
void feed(const float* val) { \
auto vval = GiLoadFloat32(val); \
res = Gi##_Mode##NanFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
auto vval = GiBroadcastFloat32(*val); \
res = Gi##_Mode##NanFloat32(vval, res); \
} \
void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \
#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
auto vval = GiLoadFloat32(val); \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \
} \
void feed_remain(const float* val) { \
auto vval = GiBroadcastFloat32(*val); \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##NanFloat32(vval, GiFixLenType2GiFloat32Type(res))); \
} \
void post(float* dst) { \
*dst = GiReduce##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res)); \
} \
}

REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest());
@@ -151,28 +172,31 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
#define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
#define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);

#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = Gi##_Mode##NanFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
using namespace std; \
remain = _Mode##_NAN(*val, remain); \
} \
void post(float* dst) { GiStoreFloat32(dst, res); } \
void post_remain(float* dst) { *dst = remain; } \
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \
} \
void feed_remain(const float* val) { \
using namespace std; \
remain = _Mode##_NAN(*val, remain); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
}

REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
@@ -181,51 +205,58 @@ REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
#undef Max_NAN
#undef Min_NAN

#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8_t res; \
_mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8_t vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(vval, res); \
} \
void feed_remain(const int8_t* val) { \
GI_INT8_t vval = GiBroadcastInt8(*val); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \
#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8_FIXLEN_t res; \
_mode##Reducer(DType, size_t) { \
res = GiInt8Type2FixLenType(GiBroadcastInt8(_init)); \
} \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8_t vval = GiLoadInt8(val); \
res = GiInt8Type2FixLenType( \
Gi##_Mode##imumInt8(vval, GiFixLenType2GiInt8Type(res))); \
} \
void feed_remain(const int8_t* val) { \
GI_INT8_t vval = GiBroadcastInt8(*val); \
res = GiInt8Type2FixLenType( \
Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval)); \
} \
void post(int8_t* dst) { \
*dst = GiReduce##_Mode##Int8(GiFixLenType2GiInt8Type(res)); \
} \
}

REDUCER_MAX_MIN_C1(max, Max, -128);
REDUCER_MAX_MIN_C1(min, Min, 127);
#undef REDUCER_MAX_MIN_C1

#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8_t res; \
int8_t remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastInt8(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8_t vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void feed_remain(const int8_t* val) { \
using namespace std; \
remain = _mode(*val, remain); \
} \
void post(int8_t* dst) { GiStoreInt8(dst, res); } \
void post_remain(int8_t* dst) { *dst = remain; } \
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8_FIXLEN_t res; \
int8_t remain; \
_mode##Reducer(DType, size_t) { \
res = GiInt8Type2FixLenType(GiBroadcastInt8(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8_t vval = GiLoadInt8(val); \
res = GiInt8Type2FixLenType( \
Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval)); \
} \
void feed_remain(const int8_t* val) { \
using namespace std; \
remain = _mode(*val, remain); \
} \
void post(int8_t* dst) { GiStoreInt8(dst, GiFixLenType2GiInt8Type(res)); } \
void post_remain(int8_t* dst) { *dst = remain; } \
}

REDUCER_MAX_MIN_C(max, Max, -128);
@@ -238,61 +269,67 @@ struct SumReducer;
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct ProductReducer;

#define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = Gi##_Mode##Float32(vval, res); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, *val); \
} \
void post(float* dst) { \
using namespace std; \
auto op = _op<float>(); \
*dst = op(remain, GiReduce##_Mode##Float32(res)); \
} \
#define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, *val); \
} \
void post(float* dst) { \
using namespace std; \
auto op = _op<float>(); \
*dst = \
op(remain, \
GiReduce##_Mode##Float32(GiFixLenType2GiFloat32Type(res))); \
} \
}

REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f);
REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
#undef REDUCER_SUM_PRODUCT_C1

#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = Gi##_Mode##Float32(vval, res); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, (*val)); \
} \
void post(float* dst) { GiStoreFloat32(dst, res); } \
void post_remain(float* dst) { *dst = remain; } \
#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, (*val)); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
}

REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f);
@@ -308,23 +345,24 @@ struct SumSqrReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

GI_FLOAT32_t res;
GI_FLOAT32_FIXLEN_t res;
float result;
SumSqrReducer(DType, size_t cnt) : result(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
res = GiBroadcastFloat32(0.0f);
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32_t vval = GiLoadFloat32(val);
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
res = GiFloat32Type2FixLenType(GiAddFloat32(
GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res)));
}
void feed_remain(const float* val) {
float vval = *val;
result += vval * vval;
}
void post(float* dst) {
result += GiReduceAddFloat32(res);
result += GiReduceAddFloat32(GiFixLenType2GiFloat32Type(res));
*dst = result;
}
};
@@ -333,19 +371,20 @@ struct SumSqrReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

GI_FLOAT32_t res;
GI_FLOAT32_FIXLEN_t res;
float remain;
SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
res = GiBroadcastFloat32(0.0f);
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32_t vval = GiLoadFloat32(val);
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
res = GiFloat32Type2FixLenType(GiAddFloat32(
GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res)));
}
void feed_remain(const float* val) { remain += (*val) * (*val); }
void post(float* dst) { GiStoreFloat32(dst, res); }
void post(float* dst) { GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); }
void post_remain(float* dst) { *dst = remain; }
};
/**************************************do reduce*************************/


+ 61
- 49
dnn/src/fallback/type_cvt/typecvt_helper.h View File

@@ -18,22 +18,26 @@ struct QuantizedTypeCvter<int32_t, int8_t> {
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t) * 2;
static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(int32_t);
float scale;
GI_FLOAT32_t vscale;
GI_FLOAT32_FIXLEN_t vscale;

QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
float src_scale = src_dtype.param<dtype::QuantizedS32>().scale;
float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale;
scale = src_scale / dst_scale;
vscale = GiBroadcastFloat32(scale);
vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}

void cvt(const int32_t* src, int8_t* dst) {
GI_FLOAT32_t vitem0 =
GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), vscale);
GI_FLOAT32_t vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiLoadInt32(src + SIMD_STEP)), vscale);

auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_t t;
t = GiFixLenType2GiFloat32Type(vscale);
GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), t);
GI_FLOAT32_t vitem1 =
GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src + SIMD_STEP)), t);

GI_FLOAT32_V2_t v2;
GiSetSubVectorFloat32V2(v2, 0, vitem0);
GiSetSubVectorFloat32V2(v2, 1, vitem1);
auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(v2);
GiStoreLowInt8(dst, vres);
}

@@ -48,27 +52,29 @@ struct QuantizedTypeCvter<int8_t, int32_t> {
using dst_type = int32_t;
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
float scale;
GI_FLOAT32_t vscale;
GI_FLOAT32_FIXLEN_t vscale;

QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
float src_scale = src_dtype.param<dtype::QuantizedS8>().scale;
float dst_scale = dst_dtype.param<dtype::QuantizedS32>().scale;
scale = src_scale / dst_scale;
vscale = GiBroadcastFloat32(scale);
vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}

void cvt(const int8_t* src, int32_t* dst) {
GI_FLOAT32_t t;
t = GiFixLenType2GiFloat32Type(vscale);
GI_INT8_t data = GiLoadInt8(src);
GI_INT16_t vitem0 = GiMoveLowLongInt8(data);
GI_INT16_t vitem1 = GiMoveHighLongInt8(data);
auto vret0 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), vscale));
auto vret1 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(GiMultiplyFloat32(
GiCastToFloat32(GiMoveHighLongInt16(vitem0)), vscale));
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t));
auto vret1 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(
GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t));
auto vret2 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), vscale));
auto vret3 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(GiMultiplyFloat32(
GiCastToFloat32(GiMoveHighLongInt16(vitem1)), vscale));
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t));
auto vret3 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(
GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t));

constexpr size_t step = GI_SIMD_LEN_BYTE / sizeof(int32_t);
GiStoreInt32(dst, vret0);
@@ -90,21 +96,26 @@ struct QuantizedTypeCvter<float, int8_t> {
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float) * 2;
static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(float);
float scale;
GI_FLOAT32_t vscale;
GI_FLOAT32_FIXLEN_t vscale;

//! Build the converter; float source has implicit scale 1, so the factor
//! is simply 1 / dst_scale.
QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
    MEGDNN_MARK_USED_VAR(src_dtype);
    float src_scale = 1;
    float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale;
    scale = src_scale / dst_scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}

//! Convert 2*SIMD_STEP floats to qint8: rescale, pack the two float
//! vectors into a V2, then saturate-convert and store the low int8 half.
void cvt(const float* src, int8_t* dst) {
    // materialize the fixed-length vscale into a native vector register
    GI_FLOAT32_t t = GiFixLenType2GiFloat32Type(vscale);
    GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiLoadFloat32(src), t);
    GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiLoadFloat32(src + SIMD_STEP), t);

    GI_FLOAT32_V2_t v2;
    GiSetSubVectorFloat32V2(v2, 0, vitem0);
    GiSetSubVectorFloat32V2(v2, 1, vitem1);
    auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(v2);
    GiStoreLowInt8(dst, vres);
}

@@ -119,18 +130,19 @@ struct QuantizedTypeCvter<int32_t, int32_t> {
using dst_type = int32_t;
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t);
//! combined rescale factor: src_scale / dst_scale
float scale;
//! broadcast of `scale`, kept as the fixed-length vector type
GI_FLOAT32_FIXLEN_t vscale;

//! Build the converter between two QuantizedS32 dtypes.
QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
    float src_scale = src_dtype.param<dtype::QuantizedS32>().scale;
    float dst_scale = dst_dtype.param<dtype::QuantizedS32>().scale;
    scale = src_scale / dst_scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}

void cvt(const int32_t* src, int32_t* dst) {
GI_FLOAT32_t vitem =
GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), vscale);
GI_FLOAT32_t t;
t = GiFixLenType2GiFloat32Type(vscale);
GI_FLOAT32_t vitem = GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), t);

auto vres = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(vitem);
GiStoreInt32(dst, vres);
@@ -148,30 +160,32 @@ struct QuantizedTypeCvter<int8_t, int8_t> {
using dst_type = int8_t;
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
//! combined rescale factor: src_scale / dst_scale
float scale;
//! broadcast of `scale`, kept as the fixed-length vector type
GI_FLOAT32_FIXLEN_t vscale;

//! Build the converter between two QuantizedS8 dtypes.
QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
    float src_scale = src_dtype.param<dtype::QuantizedS8>().scale;
    float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale;
    scale = src_scale / dst_scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}

//! Rescale one SIMD_WIDTH batch of qint8: widen to 4 float vectors,
//! multiply by scale, then saturate-convert the V4 back to int8.
void cvt(const int8_t* src, int8_t* dst) {
    // materialize the fixed-length vscale into a native vector register
    GI_FLOAT32_t t = GiFixLenType2GiFloat32Type(vscale);
    GI_INT8_t data = GiLoadInt8(src);
    GI_INT16_t vitem0 = GiMoveLowLongInt8(data);
    GI_INT16_t vitem1 = GiMoveHighLongInt8(data);
    auto vret0 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t);
    auto vret1 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t);
    auto vret2 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t);
    auto vret3 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t);
    GI_FLOAT32_V4_t v4;
    GiSetSubVectorFloat32V4(v4, 0, vret0);
    GiSetSubVectorFloat32V4(v4, 1, vret1);
    GiSetSubVectorFloat32V4(v4, 2, vret2);
    GiSetSubVectorFloat32V4(v4, 3, vret3);
    auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V4_t>(v4);
    GiStoreInt8(dst, vres);
}

@@ -245,26 +259,24 @@ struct Quan2FloatTypeCvter<int8_t, float> {
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(float);
//! dequantize factor taken from the source qint8 dtype
float _scale = 0.0f;
//! broadcast of `_scale`, kept as the fixed-length vector type
GI_FLOAT32_FIXLEN_t vscale;

//! Build the dequantizer; only the source scale matters for qint8 -> float.
Quan2FloatTypeCvter(DType src_dtype, DType dst_dtype) {
    _scale = src_dtype.param<dtype::QuantizedS8>().scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(_scale));
    MEGDNN_MARK_USED_VAR(dst_dtype);
}

//! Dequantize one SIMD_WIDTH batch of qint8 to float: widen to four
//! float vectors and multiply each by the scale.
void cvt(const int8_t* src, float* dst) {
    // materialize the fixed-length vscale into a native vector register
    GI_FLOAT32_t t = GiFixLenType2GiFloat32Type(vscale);
    GI_INT8_t data = GiLoadInt8(src);
    GI_INT16_t vitem0 = GiMoveLowLongInt8(data);
    GI_INT16_t vitem1 = GiMoveHighLongInt8(data);
    auto vret0 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t);
    auto vret1 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t);
    auto vret2 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t);
    auto vret3 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t);

    GiStoreFloat32(dst, vret0);
    GiStoreFloat32(dst + SIMD_STEP, vret1);
    // NOTE(review): stores of vret2/vret3 were truncated in the diff view;
    // reconstructed from the symmetric 4x pattern — confirm against upstream.
    GiStoreFloat32(dst + 2 * SIMD_STEP, vret2);
    GiStoreFloat32(dst + 3 * SIMD_STEP, vret3);
}

Loading…
Cancel
Save