From e34a642b31311a64961c5927657025d2bb0279fa Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 17 Feb 2022 18:27:26 +0800 Subject: [PATCH] feat(fallback): reduce support general intrinsic GitOrigin-RevId: f250aa7b2a145a66699636c11e5602f02693ed2a --- dnn/src/fallback/general_intrinsic/gi_common.h | 7 +- dnn/src/fallback/general_intrinsic/gi_float.h | 21 +- dnn/src/fallback/general_intrinsic/gi_int.h | 87 +++--- dnn/src/fallback/quantized_converter.h | 81 +++++ dnn/src/fallback/reduce/opr_impl.cpp | 106 +++++++ dnn/src/fallback/reduce/opr_impl.h | 4 + dnn/src/fallback/reduce/reducer.h | 417 +++++++++++++++++++++++++ dnn/test/arm_common/lstm.cpp | 1 - dnn/test/fallback/reduce.cpp | 69 ++++ 9 files changed, 744 insertions(+), 49 deletions(-) create mode 100644 dnn/src/fallback/quantized_converter.h create mode 100644 dnn/src/fallback/reduce/reducer.h diff --git a/dnn/src/fallback/general_intrinsic/gi_common.h b/dnn/src/fallback/general_intrinsic/gi_common.h index 3050f424..3b21d62d 100644 --- a/dnn/src/fallback/general_intrinsic/gi_common.h +++ b/dnn/src/fallback/general_intrinsic/gi_common.h @@ -95,8 +95,8 @@ typedef __m128i GI_INT16; typedef __m128i GI_INT32; #else typedef float GI_FLOAT32 __attribute__((vector_size(16))); -typedef uint16_t GI_UINT8 __attribute__((vector_size(16))); -typedef int16_t GI_INT8 __attribute__((vector_size(16))); +typedef uint8_t GI_UINT8 __attribute__((vector_size(16))); +typedef int8_t GI_INT8 __attribute__((vector_size(16))); typedef int16_t GI_INT16 __attribute__((vector_size(16))); typedef int32_t GI_INT32 __attribute__((vector_size(16))); #endif @@ -119,6 +119,9 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16))); #define GI_SIMD_LEN_BYTE 16 #endif +#define Max(a, b) (a) > (b) ? (a) : (b) +#define Min(a, b) (a) < (b) ? (a) : (b) + typedef struct { GI_INT32 val[2]; } GI_INT32_V2; diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h index 79b1e778..65142d33 100644 --- a/dnn/src/fallback/general_intrinsic/gi_float.h +++ b/dnn/src/fallback/general_intrinsic/gi_float.h @@ -223,7 +223,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #if defined(GI_NEON64_INTRINSICS) return vzip1q_f32(Vector1, Vector2); #elif defined(GI_NEON32_INTRINSICS) - float32x2_t zipped = vzipq_f32(Vector1, Vector2); + float32x4x2_t zipped = vzipq_f32(Vector1, Vector2); return zipped.val[0]; #elif defined(GI_SSE2_INTRINSICS) return _mm_unpacklo_ps(Vector1, Vector2); @@ -243,7 +243,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #if defined(GI_NEON64_INTRINSICS) return vzip2q_f32(Vector1, Vector2); #elif defined(GI_NEON32_INTRINSICS) - float32x2_t zipped = vzipq_f32(Vector1, Vector2); + float32x4x2_t zipped = vzipq_f32(Vector1, Vector2); return zipped.val[1]; #elif defined(GI_SSE2_INTRINSICS) return _mm_unpackhi_ps(Vector1, Vector2); @@ -460,7 +460,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #if defined(GI_NEON_INTRINSICS) return vmaxq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) - return _mm_max_ps(Vector1, Vector2); + //! _mm_max_ps does not fellow the IEEE standard when input is NAN, so + //! implement by C code +#define MAX_NAN(a, b) (std::isnan(a) || (a) > (b)) ? 
(a) : (b); + GI_FLOAT32 max; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + max[i] = MAX_NAN(Vector1[i], Vector2[i]); + } + return max; #else return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2); #endif @@ -473,6 +480,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { return vminq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_min_ps(Vector1, Vector2); + //! _mm_min_ps does not fellow the IEEE standard when input is NAN, so + //! implement by C code +#define MIN_NAN(a, b) (std::isnan(a) || (a) < (b)) ? (a) : (b); + GI_FLOAT32 min; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + min[i] = MIN_NAN(Vector1[i], Vector2[i]); + } + return min; #else return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1); #endif diff --git a/dnn/src/fallback/general_intrinsic/gi_int.h b/dnn/src/fallback/general_intrinsic/gi_int.h index 38dfafd3..aeabfa21 100644 --- a/dnn/src/fallback/general_intrinsic/gi_int.h +++ b/dnn/src/fallback/general_intrinsic/gi_int.h @@ -97,7 +97,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) { #elif defined(GI_SSE2_INTRINSICS) _mm_storeu_si128((__m128i*)Buffer, Vector); #else - for (int i = 0; i < 16; i++) { + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { Buffer[i] = Vector[i]; } #endif @@ -197,7 +197,8 @@ GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) { #elif defined(GI_SSE2_INTRINSICS) return _mm_andnot_si128(VectorNot, Vector); #else - return (~VectorNot) & Vector; + GI_INT8 Not = ~VectorNot; + return (Not & Vector); #endif } @@ -327,11 +328,13 @@ GiMoveHighLongInt8(GI_INT8 Vector) { for (int i = 0; i < 8; i++) { data[i] = o_data[8 + i]; } - return _mm_loadu_si16(data); + return _mm_loadu_si128((__m128i*)data); #else GI_INT16 ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { - ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i]; + int8_t* data = (int8_t*)&Vector; + size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); + for (size_t i = 0; i < half_length; i++) { + ret[i] = data[i + half_length]; } return ret; #endif @@ -351,10 +354,11 @@ GiMoveLowLongInt8(GI_INT8 Vector) { for (int i = 0; i < 8; i++) { data[i] = o_data[i]; } - return _mm_loadu_si16(data); + return _mm_loadu_si128((__m128i*)data); #else GI_INT16 ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { + size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); + for (size_t i = 0; i < half_length; i++) { ret[i] = Vector[i]; } return ret; @@ -375,11 +379,12 @@ GiMoveHighLongInt16(GI_INT16 Vector) { for (int i = 0; i < 4; i++) { data[i] = o_data[4 + i]; } - return _mm_loadu_si32(data); + return _mm_loadu_si128((__m128i*)data); #else GI_INT32 ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) { - ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i]; + size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); + for (size_t i = 0; i < half_length; i++) { + ret[i] = Vector[half_length + i]; } return ret; #endif @@ -399,10 +404,11 @@ GiMoveLowLongInt16(GI_INT16 Vector) { for (int i = 0; i < 4; i++) { data[i] = o_data[i]; } - return _mm_loadu_si32(data); + return _mm_loadu_si128((__m128i*)data); #else GI_INT32 ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) { + size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); + for (size_t i = 0; i < half_length; i++) { ret[i] = Vector[i]; } return ret; @@ -414,7 +420,7 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) { #if defined(GI_NEON64_INTRINSICS) return vaddlvq_s8(Vector); #elif 
defined(GI_NEON32_INTRINSICS) - int32_t sum = vpaddlq_s16(vpaddlq_s8(Vector)); + int32x4_t sum = vpaddlq_s16(vpaddlq_s8(Vector)); return (vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) + vgetq_lane_s32(sum, 3)); #elif defined(GI_SSE42_INTRINSICS) @@ -431,8 +437,8 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) { return (int16_t)(ret); #elif defined(GI_SSE2_INTRINSICS) - __m64 low = GiGetLowInt8x16(Vector); - __m64 high = GiGetHighInt8x16(Vector); + __m64 low = _mm_movepi64_pi64(Vector); + __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector)); __m128 v0 = _mm_cvtpi8_ps(low); __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low)); __m128 v2 = _mm_cvtpi8_ps(high); @@ -447,16 +453,13 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) { return (int16_t)(ret0 + ret1 + ret2 + ret3); #else int32_t sum = 0; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { sum += Vector[i]; } return sum; #endif } -#define Max(a, b) (a) > (b) ? (a) : (b) -#define Min(a, b) (a) < (b) ? (a) : (b) - GI_FORCEINLINE int8_t GiReduceMaxInt8(GI_INT8 Vector) { #if defined(GI_NEON64_INTRINSICS) @@ -480,23 +483,23 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) { ret = Max(_mm_extract_epi32(sum, 3), ret); return (int8_t)ret; #elif defined(GI_SSE2_INTRINSICS) - __m64 low = GiGetLowInt8x16(Vector); - __m64 high = GiGetHighInt8x16(Vector); + __m64 low = _mm_movepi64_pi64(Vector); + __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector)); __m128 v0 = _mm_cvtpi8_ps(low); __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low)); __m128 v2 = _mm_cvtpi8_ps(high); __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high)); - __m128 sum0 = _mm_add_ps(v0, v1); - __m128 sum1 = _mm_add_ps(v2, v3); - __m128 sum = _mm_add_ps(sum0, sum1); - float ret0 = _mm_cvtss_f32(sum); - float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); - float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2))); - float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3))); + __m128 max0 = _mm_max_ps(v0, v1); + __m128 max1 = _mm_max_ps(v2, v3); + __m128 max = _mm_max_ps(max0, max1); + float ret0 = _mm_cvtss_f32(max); + float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 1, 1, 1))); + float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 2, 2, 2))); + float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 3, 3, 3))); return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3))); #else int8_t max = Vector[0]; - for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { max = Max(max, Vector[i]); } return max; @@ -526,23 +529,23 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) { ret = Min(_mm_extract_epi32(sum, 3), ret); return (int8_t)ret; #elif defined(GI_SSE2_INTRINSICS) - __m64 low = GiGetLowInt8x16(Vector); - __m64 high = GiGetHighInt8x16(Vector); + __m64 low = _mm_movepi64_pi64(Vector); + __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector)); __m128 v0 = _mm_cvtpi8_ps(low); __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low)); __m128 v2 = _mm_cvtpi8_ps(high); __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high)); - __m128 sum0 = _mm_add_ps(v0, v1); - __m128 sum1 = _mm_add_ps(v2, v3); - __m128 sum = _mm_add_ps(sum0, sum1); - float ret0 = _mm_cvtss_f32(sum); - float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); - float ret2 = 
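// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): the SSE2 hunks
// above replace GiGetLow/HighInt8x16 with _mm_movepi64_pi64/_mm_unpackhi_epi64
// and make the max/min reductions actually use _mm_max_ps/_mm_min_ps instead
// of summing. A scalar oracle with the same semantics, handy as a unit-test
// reference, could look like this; the ref_* names are the editor's own.
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Scalar counterpart of GiReduceAddInt8: accumulate in int32, return int16.
static inline int16_t ref_reduce_add_s8(const int8_t* v, size_t n) {
    int32_t sum = 0;
    for (size_t i = 0; i < n; ++i)
        sum += v[i];
    return static_cast<int16_t>(sum);
}

// Scalar counterpart of GiReduceMaxInt8 (swap std::max for std::min to get
// the GiReduceMinInt8 oracle).
static inline int8_t ref_reduce_max_s8(const int8_t* v, size_t n) {
    int8_t m = v[0];
    for (size_t i = 1; i < n; ++i)
        m = std::max(m, v[i]);
    return m;
}
// ---------------------------------------------------------------------------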
_mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2))); - float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3))); + __m128 min0 = _mm_min_ps(v0, v1); + __m128 min1 = _mm_min_ps(v2, v3); + __m128 min = _mm_min_ps(min0, min1); + float ret0 = _mm_cvtss_f32(min); + float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(1, 1, 1, 1))); + float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 2, 2, 2))); + float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 3, 3, 3))); return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3))); #else int8_t min = Vector[0]; - for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { min = Min(min, Vector[i]); } return min; @@ -561,8 +564,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { #if __ARM_ARCH >= 8 int32x4_t vres0 = vcvtaq_s32_f32(src); int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); - int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16))); - return vcombine_s16(ret, ret); + return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); #else float32x4_t vzero = vdupq_n_f32(0.f); float32x4_t vfhalf = vdupq_n_f32(0.5f); @@ -570,8 +572,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { float32x4_t vinc0 = vbslq_f32(vcgeq_f32(src, vzero), vfhalf, vfneg_half); int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0)); int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); - int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16))); - return vcombine_s16(ret, ret); + return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); #endif #elif defined(GI_SSE42_INTRINSICS) __m128 vfzero = _mm_set1_ps(0.f); diff --git a/dnn/src/fallback/quantized_converter.h b/dnn/src/fallback/quantized_converter.h new file mode 100644 index 00000000..b842a862 --- /dev/null +++ b/dnn/src/fallback/quantized_converter.h @@ -0,0 +1,81 @@ +/** + * \file dnn/src/arm_common/quantized_converter.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once + +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" +#include "src/common/utils.h" +#include "src/fallback/general_intrinsic/gi_float.h" +#include "src/fallback/general_intrinsic/gi_int.h" + +namespace megdnn { +namespace fallback { + +struct QConverterBase { + inline static GI_INT32 vzero() { return GiBroadcastInt32(0); } + + inline static GI_FLOAT32 vfzero() { return GiBroadcastFloat32(0.f); } + + inline static GI_FLOAT32 vfhalf() { return GiBroadcastFloat32(0.5f); } + + inline static GI_FLOAT32 vfneg_half() { return GiBroadcastFloat32(-0.5f); } +}; + +struct QConverter { + template + static inline dst_type convert(const src_type&... src); + + template + static inline dst_type round(const src_type&... 
src); +}; + +template <> +inline dt_qint8 QConverter::convert(const float& src) { + return dt_qint8(saturate(std::round(src), -128, 127)); +} + +template <> +inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) { + return dt_quint8(saturate(std::round(src) + zp, 0, 255)); +} + +template <> +inline dt_qint32 QConverter::convert(const float& src) { + return dt_qint32(saturate( + std::round(src), static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max()))); +} + +template <> +inline GI_FLOAT32_V2 QConverter::convert(const GI_INT16& vsrc) { + GI_INT32 vhi = GiMoveHighLongInt16(vsrc); + GI_INT32 vlo = GiMoveLowLongInt16(vsrc); + return {{GiCastToFloat32(vlo), GiCastToFloat32(vhi)}}; +} + +template <> +inline GI_INT8 QConverter::convert(const GI_FLOAT32_V2& vsrc) { + return GiCvtFromFloat32V2ToInt8(vsrc); +} +template <> +inline GI_INT8 QConverter::convert(const GI_FLOAT32& src) { + return GiCvtFromFloat32ToInt8(src); +} + +template <> +inline GI_INT32 QConverter::round(const GI_FLOAT32& vsrc) { + return GiRoundAsInt32(vsrc); +} +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/reduce/opr_impl.cpp b/dnn/src/fallback/reduce/opr_impl.cpp index 78192424..66041510 100644 --- a/dnn/src/fallback/reduce/opr_impl.cpp +++ b/dnn/src/fallback/reduce/opr_impl.cpp @@ -14,11 +14,13 @@ #include "src/naive/handle.h" #include "midout.h" +#include "reducer.h" #include "src/common/reduce_helper.h" MIDOUT_DECL(megdnn_fb_reduce_op) MIDOUT_DECL(megdnn_fb_reduce_c) MIDOUT_DECL(megdnn_fb_reduce_dtype) +MIDOUT_DECL(megdnn_fallback_reduce_optimized) namespace { @@ -77,11 +79,20 @@ namespace fallback { void ReduceImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { + check_exec(src.layout, dst.layout, workspace.size); + if (!exec_optimized(src, dst, workspace)) { + return exec_fallback(src, dst, workspace); + } +} + +void ReduceImpl::exec_fallback( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { using namespace reduce; using Mode = Param::Mode; check_exec(src.layout, dst.layout, workspace.size); size_t A, B, C; get_ABC(src.layout, A, B, C, param().axis); + #define cb_by_op(src_type, dst_type, _wtype, mode_, Op_, kern_func) \ if (param().mode == mode_) { \ typedef DTypeTrait::ctype src_ctype; \ @@ -176,6 +187,101 @@ void ReduceImpl::exec( naive::ReduceForwardImpl::exec(src, dst, workspace); } +bool ReduceImpl::exec_optimized( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) { + size_t A, B, C; + reduce::get_ABC(src.layout, A, B, C, param().axis); + bool execed = false; + using Mode = param::Reduce::Mode; +#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \ + if (C == 1) { \ + using _Reducer = Reducer; \ + std::function \ + do_reduce = Exec<_Reducer, true>::do_reduce; \ + MIDOUT_BEGIN( \ + megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ + midout_iv(0)) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ + execed = true; \ + } \ + MIDOUT_END(); \ + } else { \ + using _Reducer = Reducer; \ + std::function \ + do_reduce = Exec<_Reducer, false>::do_reduce; \ + MIDOUT_BEGIN( \ + megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ + midout_iv(1)) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ + execed = true; \ + } \ + 
MIDOUT_END(); \ + } + +#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \ + switch (param().mode) { \ + case Mode::MEAN: \ + DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \ + break; \ + case Mode::MAX: \ + DISPATCH_FUNC(maxReducer, dtype, ctype, ctype); \ + break; \ + case Mode::MIN: \ + DISPATCH_FUNC(minReducer, dtype, ctype, ctype); \ + break; \ + default: \ + break; \ + } + +#define DISPATCH_MODE_FLOAT(dtype, ctype, comp_type) \ + switch (param().mode) { \ + case Mode::MEAN: \ + DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \ + break; \ + case Mode::MAX: \ + DISPATCH_FUNC(maxReducer, dtype, ctype, ctype); \ + break; \ + case Mode::MIN: \ + DISPATCH_FUNC(minReducer, dtype, ctype, ctype); \ + break; \ + case Mode::SUM: \ + DISPATCH_FUNC(SumReducer, dtype, ctype, ctype); \ + break; \ + case Mode::SUM_SQR: \ + DISPATCH_FUNC(SumSqrReducer, dtype, ctype, ctype); \ + break; \ + case Mode::PRODUCT: \ + DISPATCH_FUNC(ProductReducer, dtype, ctype, ctype); \ + break; \ + default: \ + break; \ + } + if (src.layout.is_contiguous() && + src.layout.dtype.category() == DTypeCategory::QUANTIZED && + param().data_type == param::Reduce::DataType::DEFAULT) { + DType src_type = src.layout.dtype; + if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { + DISPATCH_MODE_QUANTIZED(dt_qint8, int8_t, int32_t) + } + } else if ( + src.layout.is_contiguous() && + src.layout.dtype.category() == DTypeCategory::FLOAT && + param().data_type == param::Reduce::DataType::DEFAULT) { + DType src_type = src.layout.dtype; + if (src.layout.dtype.enumv() == DTypeEnum::Float32) { + DISPATCH_MODE_FLOAT(dt_float32, float, float) + } + } + return execed; +#undef DISPATCH_FUNC +#undef DISPATCH_MODE_QUANTIZED +#undef DISPATCH_MODE_FLOAT +} + } // namespace fallback } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/reduce/opr_impl.h b/dnn/src/fallback/reduce/opr_impl.h index 759b405d..3db4401a 100644 --- a/dnn/src/fallback/reduce/opr_impl.h +++ b/dnn/src/fallback/reduce/opr_impl.h @@ -19,6 +19,10 @@ public: using ReduceForwardImpl::ReduceForwardImpl; void exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) override; + bool exec_optimized( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); + void exec_fallback( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); }; } // namespace fallback diff --git a/dnn/src/fallback/reduce/reducer.h b/dnn/src/fallback/reduce/reducer.h new file mode 100644 index 00000000..efeee039 --- /dev/null +++ b/dnn/src/fallback/reduce/reducer.h @@ -0,0 +1,417 @@ +/** + * \file dnn/src/fallback/reduce/reducer.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "src/common/utils.h" +#include "src/fallback/general_intrinsic/gi_float.h" +#include "src/fallback/general_intrinsic/gi_int.h" +#include "src/fallback/quantized_converter.h" + +using namespace megdnn; +using namespace fallback; + +namespace { + +/*****************************Mean Reducer***********************/ +template +struct MeanReducer; + +template <> +struct MeanReducer { + using ctype = int8_t; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); + + int32_t res; + float coef; + MeanReducer(DType, size_t cnt) : res(0), coef(1.0 / cnt) {} + MeanReducer() = default; + void feed(const int8_t* val) { res += GiReduceAddInt8(GiLoadInt8(val)); } + void feed_remain(const int8_t* val) { res += *val; } + void post(int8_t* dst) { + float sum = res * coef; + *dst = std::round(sum); + } +}; + +template <> +struct MeanReducer { + using ctype = int8_t; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); + + GI_INT32 res[4]; + int32_t remain; + int32_t cnt; + float coef; + GI_FLOAT32 vcoef; + MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { + memset(res, 0, sizeof(res)); + vcoef = GiBroadcastFloat32(coef); + } + MeanReducer() = default; + void feed(const int8_t* val) { + const GI_INT8 vval = GiLoadInt8(val); + const GI_INT16 vval_low = GiMoveLowLongInt8(vval); + const GI_INT16 vval_high = GiMoveHighLongInt8(vval); + + const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low); + const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low); + const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high); + const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high); + + res[0] = GiAddInt32(res[0], vval_low_low); + res[1] = GiAddInt32(res[1], vval_low_high); + res[2] = GiAddInt32(res[2], vval_high_low); + res[3] = GiAddInt32(res[3], vval_high_high); + } + void feed_remain(const int8_t* val) { remain += *val; } + void post(int8_t* dst) { + for (int i = 0; i < 4; i += 2) { + GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); + GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); + GiStoreLowInt8( + dst, + (QConverter::convert({{vitem0, vitem1}}))); + dst += 8; + } + } + void post_remain(int8_t* dst) { + float sum = remain * coef; + *dst = std::round(sum); + } +}; + +template <> +struct MeanReducer { + using ctype = float; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); + + GI_FLOAT32 res; + float result; + float coef; + MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { + res = GiBroadcastFloat32(0.0f); + } + MeanReducer() = default; + void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); } + void feed_remain(const float* val) { result += *val; } + void post(float* dst) { + result += GiReduceAddFloat32(res); + *dst = result * coef; + } +}; + +template <> +struct MeanReducer { + using ctype = float; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); + + GI_FLOAT32 res; + float remain; + float coef; + MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { + res = GiBroadcastFloat32(0.0f); + } + MeanReducer() = default; + void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); } + void feed_remain(const float* val) { remain += *val; } + void post(float* dst) { + res = GiMultiplyScalerFloat32(res, coef); + GiStoreFloat32(dst, res); + } + void post_remain(float* dst) { *dst = remain * coef; } +}; + +/******************************max min 
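// ---------------------------------------------------------------------------
// Editor's note (illustrative, not part of the patch): every reducer in this
// header follows the same implicit interface, which the Exec<> drivers at the
// bottom of the file rely on. "SkeletonReducer" below is a made-up name that
// only documents the expected shape; the real reducers are defined around it.
struct SkeletonReducer {
    using ctype = float;                                  // element type
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
    SkeletonReducer(DType src_dtype, size_t reduce_len);  // reduce_len == B
    void feed(const ctype* val);         // consume SIMD_WIDTH contiguous values
    void feed_remain(const ctype* val);  // consume one scalar tail value
    void post(ctype* dst);               // write the accumulated result(s)
    void post_remain(ctype* dst);        // scalar tail output (C != 1 variant)
};
// The C == 1 specializations collapse everything to one scalar in post(); the
// C != 1 specializations keep a full vector in post() plus post_remain() for
// the columns that do not fill a whole vector.
// ---------------------------------------------------------------------------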
Reducer****************************/ +template +struct maxReducer; +template +struct minReducer; + +#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32 res; \ + _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + auto vval = GiLoadFloat32(val); \ + res = Gi##_Mode##imumFloat32(vval, res); \ + } \ + void feed_remain(const float* val) { \ + auto vval = GiBroadcastFloat32(*val); \ + res = Gi##_Mode##imumFloat32(vval, res); \ + } \ + void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \ + } + +REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits::lowest()); +REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits::max()); +#undef REDUCER_MAX_MIN_C1 + +#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32 res; \ + float remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiBroadcastFloat32(_init); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + GI_FLOAT32 vval = GiLoadFloat32(val); \ + res = Gi##_Mode##imumFloat32(vval, res); \ + } \ + void feed_remain(const float* val) { \ + using namespace std; \ + remain = _mode(*val, remain); \ + } \ + void post(float* dst) { GiStoreFloat32(dst, res); } \ + void post_remain(float* dst) { *dst = remain; } \ + } + +REDUCER_MAX_MIN_C(max, Max, std::numeric_limits::lowest()); +REDUCER_MAX_MIN_C(min, Min, std::numeric_limits::max()); +#undef REDUCER_MAX_MIN_C + +#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = int8_t; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ + GI_INT8 res; \ + _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ + _mode##Reducer() = default; \ + void feed(const int8_t* val) { \ + GI_INT8 vval = GiLoadInt8(val); \ + res = Gi##_Mode##imumInt8(vval, res); \ + } \ + void feed_remain(const int8_t* val) { \ + GI_INT8 vval = GiBroadcastInt8(*val); \ + res = Gi##_Mode##imumInt8(vval, res); \ + } \ + void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ + } + +REDUCER_MAX_MIN_C1(max, Max, -128); +REDUCER_MAX_MIN_C1(min, Min, 127); +#undef REDUCER_MAX_MIN_C1 + +#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = int8_t; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ + GI_INT8 res; \ + int8_t remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiBroadcastInt8(_init); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const int8_t* val) { \ + GI_INT8 vval = GiLoadInt8(val); \ + res = Gi##_Mode##imumInt8(vval, res); \ + } \ + void feed_remain(const int8_t* val) { \ + using namespace std; \ + remain = _mode(*val, remain); \ + } \ + void post(int8_t* dst) { GiStoreInt8(dst, res); } \ + void post_remain(int8_t* dst) { *dst = remain; } \ + } + +REDUCER_MAX_MIN_C(max, Max, -128); +REDUCER_MAX_MIN_C(min, Min, 127); +#undef REDUCER_MAX_MIN_C + +/***************************Sum Product Reducer***************************/ +template +struct SumReducer; +template +struct ProductReducer; + +#define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ 
+ static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32 res; \ + float remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiBroadcastFloat32(_init); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + GI_FLOAT32 vval = GiLoadFloat32(val); \ + res = Gi##_Mode##Float32(vval, res); \ + } \ + void feed_remain(const float* val) { \ + using namespace std; \ + auto op = _op(); \ + remain = op(remain, *val); \ + } \ + void post(float* dst) { \ + using namespace std; \ + auto op = _op(); \ + *dst = op(remain, GiReduce##_Mode##Float32(res)); \ + } \ + } + +REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f); +REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); +#undef REDUCER_SUM_PRODUCT_C1 + +#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32 res; \ + float remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiBroadcastFloat32(_init); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + GI_FLOAT32 vval = GiLoadFloat32(val); \ + res = Gi##_Mode##Float32(vval, res); \ + } \ + void feed_remain(const float* val) { \ + using namespace std; \ + auto op = _op(); \ + remain = op(remain, (*val)); \ + } \ + void post(float* dst) { GiStoreFloat32(dst, res); } \ + void post_remain(float* dst) { *dst = remain; } \ + } + +REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f); +REDUCER_SUM_PRODUCT_C(Product, Multiply, multiplies, 1.0f); +#undef REDUCER_SUM_PRODUCT_C + +/***************************SumSqr Reducer***************************/ +template +struct SumSqrReducer; + +template <> +struct SumSqrReducer { + using ctype = float; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); + + GI_FLOAT32 res; + float result; + SumSqrReducer(DType, size_t cnt) : result(0.0f) { + MEGDNN_MARK_USED_VAR(cnt); + res = GiBroadcastFloat32(0.0f); + } + SumSqrReducer() = default; + void feed(const float* val) { + GI_FLOAT32 vval = GiLoadFloat32(val); + res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); + } + void feed_remain(const float* val) { + float vval = *val; + result += vval * vval; + } + void post(float* dst) { + result += GiReduceAddFloat32(res); + *dst = result; + } +}; +template <> +struct SumSqrReducer { + using ctype = float; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); + + GI_FLOAT32 res; + float remain; + SumSqrReducer(DType, size_t cnt) : remain(0.0f) { + MEGDNN_MARK_USED_VAR(cnt); + res = GiBroadcastFloat32(0.0f); + } + SumSqrReducer() = default; + void feed(const float* val) { + GI_FLOAT32 vval = GiLoadFloat32(val); + res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); + } + void feed_remain(const float* val) { remain += (*val) * (*val); } + void post(float* dst) { GiStoreFloat32(dst, res); } + void post_remain(float* dst) { *dst = remain; } +}; +/**************************************do reduce*************************/ + +template +struct Exec { + static void do_reduce( + const typename Reducer::ctype* src, typename Reducer::ctype* dst, + DType src_dtype, size_t A, size_t B, size_t C); +}; + +template +struct Exec { + static void do_reduce( + const typename Reducer::ctype* src, typename Reducer::ctype* dst, + DType src_dtype, size_t A, size_t B, size_t) { + size_t a = 0; + for (; a < A; a++) { + Reducer reducer0(src_dtype, B); + auto temp_src0 = src + a * B; + size_t b = 0; + for (; b + 
Reducer::SIMD_WIDTH <= B; b += Reducer::SIMD_WIDTH) { + reducer0.feed(temp_src0); + temp_src0 += Reducer::SIMD_WIDTH; + } + for (; b < B; b++) { + reducer0.feed_remain(temp_src0); + temp_src0++; + } + reducer0.post(dst); + dst++; + } + } +}; + +template +struct Exec { + static void do_reduce( + const typename Reducer::ctype* src, typename Reducer::ctype* dst, + DType src_dtype, size_t A, size_t B, size_t C) { + for (size_t a = 0; a < A; a++) { + size_t c = 0; + for (; c + Reducer::SIMD_WIDTH <= C; c += Reducer::SIMD_WIDTH) { + Reducer reducer(src_dtype, B); + for (size_t b = 0; b < B; b++) + reducer.feed(src + c + C * b); + reducer.post(dst); + dst += Reducer::SIMD_WIDTH; + } + for (; c < C; c++) { + Reducer reducer(src_dtype, B); + for (size_t b = 0; b < B; b++) + reducer.feed_remain(src + c + C * b); + reducer.post_remain(dst); + dst++; + } + src += B * C; + } + } +}; + +} // namespace + +// vim: syntax=cpp.doxygen diff --git a/dnn/test/arm_common/lstm.cpp b/dnn/test/arm_common/lstm.cpp index ddc45a10..ebcc6f89 100644 --- a/dnn/test/arm_common/lstm.cpp +++ b/dnn/test/arm_common/lstm.cpp @@ -181,7 +181,6 @@ TEST_F(ARM_COMMON, LSTM_FORWARD_RECORD) { TEST_F(ARM_COMMON, BENCHMARK_LSTM_FORWARD) { Benchmarker optimized_bench(handle()); - constexpr size_t RUNS = 20; auto run = [&](size_t hidden_size, size_t input_size) { optimized_bench.set_times(20).set_display(true); size_t gate_hidden_size = 4 * hidden_size; diff --git a/dnn/test/fallback/reduce.cpp b/dnn/test/fallback/reduce.cpp index 0a8de3b6..c508649b 100644 --- a/dnn/test/fallback/reduce.cpp +++ b/dnn/test/fallback/reduce.cpp @@ -18,6 +18,75 @@ using namespace megdnn; using namespace test; +TEST_F(FALLBACK, REDUCE_FULL) { + using Param = Reduce::Param; + using Mode = Param::Mode; + Checker checker(handle()); + UniformIntRNG rng{INT8_MIN >> 1, INT8_MAX >> 1}; + checker.set_rng(0, &rng); + struct Config { + Param param; + DType dtype; + TensorShape shape; + Config(Param param, DType dtype, TensorShape shape) + : param(param), dtype(dtype), shape(shape) {} + }; + std::vector configs; + for (auto mode : {Mode::MEAN, Mode::MAX, Mode::MIN}) + for (auto dtype : std::vector{ + dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f), + dtype::Quantized8Asymm(1.3f, static_cast(3))}) + for (int32_t axis : {0, 1, 2}) { + for (size_t A : {1, 3, 5}) { + for (size_t B : {4, 6, 9, 16, 33, 45}) { + for (size_t C : {4, 6, 9, 16, 33, 45}) { + TensorShape shape{A, B, C}; + Param param(mode, axis); + Config config(param, dtype, shape); + configs.push_back(config); + } + } + } + } + for (auto&& config : configs) { + auto&& dtype = config.dtype; + auto&& param = config.param; + auto&& shape = config.shape; + + checker.set_dtype(0, dtype).set_param(param).execs({shape, {}}); + } + configs.clear(); + for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR}) + for (auto dtype : std::vector{dtype::Float32(), dtype::Float16()}) + for (int32_t axis : {0, 1, 2}) { + for (size_t A : {1, 3, 5}) { + for (size_t B : {4, 6, 9, 16, 33, 45}) { + for (size_t C : {4, 6, 9, 16, 33, 45}) { + TensorShape shape{A, B, C}; + Param param(mode, axis); + Config config(param, dtype, shape); + configs.push_back(config); + } + } + } + } + + UniformFloatRNG rng_float(-2, 2); + checker.set_rng(0, &rng_float); + checker.set_epsilon(1e-1); + for (auto&& config : configs) { + auto&& dtype = config.dtype; + auto&& param = config.param; + auto&& shape = config.shape; + if (dtype == dtype::Float16()) + checker.set_epsilon(1e-1); + else + checker.set_epsilon(1e-3); + + checker.set_dtype(0, 
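// ---------------------------------------------------------------------------
// Editor's note (illustrative, not part of the patch): a scalar reference for
// the C != 1 traversal that Exec<Reducer, false>::do_reduce performs earlier
// in this patch (dnn/src/fallback/reduce/reducer.h), written out for a plain
// sum. Reducing a contiguous (A, B, C) tensor over the middle axis reads
// src[a*B*C + b*C + c] for b in [0, B) and writes one value per (a, c) pair;
// the function name below is the editor's own.
#include <cstddef>

static void ref_reduce_middle_axis(
        const float* src, float* dst, size_t A, size_t B, size_t C) {
    for (size_t a = 0; a < A; ++a)
        for (size_t c = 0; c < C; ++c) {
            float acc = 0.f;
            for (size_t b = 0; b < B; ++b)
                acc += src[a * B * C + b * C + c];
            dst[a * C + c] = acc;
        }
}
// ---------------------------------------------------------------------------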
dtype).set_param(param).execs({shape, {}});
+    }
+}
+
 TEST_F(FALLBACK, REDUCE) {
     using Param = Reduce::Param;
     using Mode = Param::Mode;
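// ---------------------------------------------------------------------------
// Editor's note (illustrative, not part of the patch): a minimal hand-written
// case in the style of REDUCE_FULL above, for exercising the new fallback
// reduce path in isolation. The shape, mode and function name are arbitrary
// editor choices; it assumes the includes and using-directives already present
// in this test file.
static void run_single_fallback_reduce_case(Handle* handle) {
    using Param = Reduce::Param;
    using Mode = Param::Mode;
    Checker<Reduce> single_case(handle);
    single_case.set_dtype(0, dtype::Float32())
            .set_param(Param(Mode::MEAN, /*axis=*/1))  // hits the C != 1 optimized path
            .execs({{3, 64, 5}, {}});
}
// ---------------------------------------------------------------------------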