From 93c7e451886ac0d2aa1596ff715e01775e6fc3fa Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 23 Feb 2022 15:41:38 +0800 Subject: [PATCH] feat(arm): delete the reduant implement GitOrigin-RevId: ff32a3dc8b33c264956c64ccf730d363057ac501 --- dnn/src/arm_common/reduce/opr_impl.cpp | 293 +------------------------ dnn/src/fallback/general_intrinsic/gi_common.h | 8 +- dnn/src/fallback/general_intrinsic/gi_float.h | 23 +- dnn/src/fallback/general_intrinsic/gi_int.h | 12 +- dnn/src/fallback/reduce/reducer.h | 8 +- 5 files changed, 28 insertions(+), 316 deletions(-) diff --git a/dnn/src/arm_common/reduce/opr_impl.cpp b/dnn/src/arm_common/reduce/opr_impl.cpp index d67f96a4..b5e73c37 100644 --- a/dnn/src/arm_common/reduce/opr_impl.cpp +++ b/dnn/src/arm_common/reduce/opr_impl.cpp @@ -40,33 +40,6 @@ template struct MeanReducer; template <> -struct MeanReducer { - using ctype = int8_t; - static constexpr int SIMD_WIDTH = 16; - - int32_t res; - float coef; - MeanReducer(DType, size_t cnt) : res(0), coef(1.0 / cnt) {} - MeanReducer() = default; - void feed(const int8_t* val) { -#if MEGDNN_AARCH64 - res += vaddlvq_s8(vld1q_s8(val)); -#elif MEGDNN_ARMV7 - auto sum = vpaddlq_s16(vpaddlq_s8(vld1q_s8(val))); - res += (vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + - vgetq_lane_s32(sum, 2) + vgetq_lane_s32(sum, 3)); -#else -#error "unsupport android arch" -#endif - } - void feed_remain(const int8_t* val) { res += *val; } - void post(int8_t* dst) { - float sum = res * coef; - *dst = std::round(sum); - } -}; - -template <> struct MeanReducer { using ctype = uint8_t; static constexpr int SIMD_WIDTH = 16; @@ -97,33 +70,6 @@ struct MeanReducer { } }; -template <> -struct MeanReducer { - using ctype = float; - static constexpr int SIMD_WIDTH = 4; - - float32x4_t res; - float result; - float coef; - MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { - res = vdupq_n_f32(0.0f); - } - MeanReducer() = default; - void feed(const float* val) { res = vaddq_f32(vld1q_f32(val), res); } - void feed_remain(const float* val) { result += *val; } - void post(float* dst) { -#if MEGDNN_AARCH64 - result += vaddvq_f32(res); -#elif MEGDNN_ARMV7 - auto sum_temp = vpadd_f32(vget_low_f32(res), vget_high_f32(res)); - result += (vget_lane_f32(sum_temp, 0) + vget_lane_f32(sum_temp, 1)); -#else -#error "unsupport android arch" -#endif - *dst = result * coef; - } -}; - #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> struct MeanReducer<__fp16, __fp16, __fp16, true> { @@ -171,73 +117,6 @@ struct MeanReducer<__fp16, __fp16, __fp16, false> { #endif template <> -struct MeanReducer { - using ctype = float; - static constexpr int SIMD_WIDTH = 4; - - float32x4_t res; - float remain; - float coef; - MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { - res = vdupq_n_f32(0.0f); - } - MeanReducer() = default; - void feed(const float* val) { res = vaddq_f32(vld1q_f32(val), res); } - void feed_remain(const float* val) { remain += *val; } - void post(float* dst) { - res = vmulq_n_f32(res, coef); - vst1q_f32(dst, res); - } - void post_remain(float* dst) { *dst = remain * coef; } -}; - -template <> -struct MeanReducer { - using ctype = int8_t; - static constexpr int SIMD_WIDTH = 16; - - int32x4_t res[4]; - int32_t remain; - int32_t cnt; - float coef; - float32x4_t vcoef; - MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { - memset(res, 0, sizeof(res)); - vcoef = vdupq_n_f32(coef); - } - MeanReducer() = default; - void feed(const int8_t* val) { - const int8x16_t vval = vld1q_s8(val); - const int16x8_t vval_low = vmovl_s8(vget_low_s8(vval)); - const int16x8_t vval_high = vmovl_s8(vget_high_s8(vval)); - - const int32x4_t vval_low_low = vmovl_s16(vget_low_s16(vval_low)); - const int32x4_t vval_low_high = vmovl_s16(vget_high_s16(vval_low)); - const int32x4_t vval_high_low = vmovl_s16(vget_low_s16(vval_high)); - const int32x4_t vval_high_high = vmovl_s16(vget_high_s16(vval_high)); - - res[0] = vaddq_s32(res[0], vval_low_low); - res[1] = vaddq_s32(res[1], vval_low_high); - res[2] = vaddq_s32(res[2], vval_high_low); - res[3] = vaddq_s32(res[3], vval_high_high); - } - void feed_remain(const int8_t* val) { remain += *val; } - void post(int8_t* dst) { - for (int i = 0; i < 4; i += 2) { - float32x4_t vitem0 = vmulq_f32(vcvtq_f32_s32(res[i]), vcoef); - float32x4_t vitem1 = vmulq_f32(vcvtq_f32_s32(res[i + 1]), vcoef); - vst1_s8(dst, - (QConverter::convert({{vitem0, vitem1}}))); - dst += 8; - } - } - void post_remain(int8_t* dst) { - float sum = remain * coef; - *dst = std::round(sum); - } -}; - -template <> struct MeanReducer { using ctype = uint8_t; static constexpr int SIMD_WIDTH = 16; @@ -335,8 +214,6 @@ struct minReducer; } \ } -REDUCER_MAX_MIN_C1(max, dt_qint8, int8_t, int8_t, s, int, -128); -REDUCER_MAX_MIN_C1(min, dt_qint8, int8_t, int8_t, s, int, 127); REDUCER_MAX_MIN_C1(max, dt_quint8, uint8_t, uint8_t, u, uint, 0); REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255); #undef REDUCER_MAX_MIN_C1 @@ -364,45 +241,10 @@ REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255); void post_remain(ctype* dst) { vst1q_lane_##_stype(dst, remain, 0); } \ } -REDUCER_MAX_MIN_C(max, dt_qint8, int8_t, int8_t, s8, int, -128); -REDUCER_MAX_MIN_C(min, dt_qint8, int8_t, int8_t, s8, int, 127); REDUCER_MAX_MIN_C(max, dt_quint8, uint8_t, uint8_t, u8, uint, 0); REDUCER_MAX_MIN_C(min, dt_quint8, uint8_t, uint8_t, u8, uint, 255); #undef REDUCER_MAX_MIN_C -#define REDUCER_MAX_MIN_C1( \ - _mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \ - template <> \ - struct _mode##Reducer<_dtype, _ctype, _comp_type, true> { \ - using ctype = _ctype; \ - static constexpr int SIMD_WIDTH = _num; \ - __stype res; \ - _mode##Reducer(DType, size_t) { res = vdupq_n_##_stype(_init); } \ - _mode##Reducer() = default; \ - void feed(const ctype* val) { \ - __stype vval = vld1q_##_stype(val); \ - res = v##_mode##q_##_stype(vval, res); \ - } \ - void feed_remain(const ctype* val) { \ - __stype vval = vdupq_n_##_stype(*val); \ - res = v##_mode##q_##_stype(vval, res); \ - } \ - void post(ctype* dst) { \ - auto val = v##_mode##_##_stype( \ - vget_low_##_stype(res), vget_high_##_stype(res)); \ - using namespace std; \ - *dst = _mode({vget_lane_##_stype(val, 0), vget_lane_##_stype(val, 1)}); \ - } \ - } - -REDUCER_MAX_MIN_C1( - max, dt_float32, float, float, f32, float32x4_t, 4, - std::numeric_limits::lowest()); -REDUCER_MAX_MIN_C1( - min, dt_float32, float, float, f32, float32x4_t, 4, - std::numeric_limits::max()); -#undef REDUCER_MAX_MIN_C1 - #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #define REDUCER_MAX_MIN_C1( \ _mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \ @@ -440,38 +282,6 @@ REDUCER_MAX_MIN_C1( #undef REDUCER_MAX_MIN_C1 #endif -#define REDUCER_MAX_MIN_C( \ - _mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \ - template <> \ - struct _mode##Reducer<_dtype, _ctype, _comp_type, false> { \ - using ctype = _ctype; \ - static constexpr int SIMD_WIDTH = _num; \ - __stype res; \ - ctype remain; \ - _mode##Reducer(DType, size_t) { \ - res = vdupq_n_##_stype(_init); \ - remain = _init; \ - } \ - _mode##Reducer() = default; \ - void feed(const ctype* val) { \ - __stype vval = vld1q_##_stype(val); \ - res = v##_mode##q_##_stype(vval, res); \ - } \ - void feed_remain(const ctype* val) { \ - using namespace std; \ - remain = _mode(*val, remain); \ - } \ - void post(ctype* dst) { vst1q_##_stype(dst, res); } \ - void post_remain(ctype* dst) { *dst = remain; } \ - } - -REDUCER_MAX_MIN_C( - max, dt_float32, float, float, f32, float32x4_t, 4, - std::numeric_limits::lowest()); -REDUCER_MAX_MIN_C( - min, dt_float32, float, float, f32, float32x4_t, 4, - std::numeric_limits::max()); -#undef REDUCER_MAX_MIN_C #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #define REDUCER_MAX_MIN_C( \ _mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \ @@ -513,45 +323,6 @@ struct SumReducer; template struct ProductReducer; -#define REDUCER_SUM_PRODUCT_C1( \ - _mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init, _act, _op) \ - template <> \ - struct _mode##Reducer<_dtype, _ctype, _comp_type, true> { \ - using ctype = _ctype; \ - static constexpr int SIMD_WIDTH = _num; \ - __stype res; \ - ctype remain; \ - _mode##Reducer(DType, size_t) { \ - res = vdupq_n_##_stype(_init); \ - remain = _init; \ - } \ - _mode##Reducer() = default; \ - void feed(const ctype* val) { \ - __stype vval = vld1q_##_stype(val); \ - res = v##_act##q_##_stype(vval, res); \ - } \ - void feed_remain(const ctype* val) { \ - using namespace std; \ - auto op = _op(); \ - remain = op(remain, *val); \ - } \ - void post(ctype* dst) { \ - using namespace std; \ - auto val = v##_act##_##_stype( \ - vget_low_##_stype(res), vget_high_##_stype(res)); \ - auto op = _op(); \ - *dst = \ - op(remain, \ - op(vget_lane_##_stype(val, 0), vget_lane_##_stype(val, 1))); \ - } \ - } - -REDUCER_SUM_PRODUCT_C1( - Sum, dt_float32, float, float, f32, float32x4_t, 4, 0, add, plus); -REDUCER_SUM_PRODUCT_C1( - Product, dt_float32, float, float, f32, float32x4_t, 4, 1.0f, mul, multiplies); -#undef REDUCER_SUM_PRODUCT_C1 - #define REDUCER_SUM_PRODUCT_C( \ _mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init, _act, _op) \ template <> \ @@ -578,9 +349,6 @@ REDUCER_SUM_PRODUCT_C1( void post_remain(ctype* dst) { *dst = remain; } \ } -REDUCER_SUM_PRODUCT_C(Sum, dt_float32, float, float, f32, float32x4_t, 4, 0, add, plus); -REDUCER_SUM_PRODUCT_C( - Product, dt_float32, float, float, f32, float32x4_t, 4, 1, mul, multiplies); #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC REDUCER_SUM_PRODUCT_C(Sum, __fp16, __fp16, __fp16, f16, float16x8_t, 8, 0, add, plus); REDUCER_SUM_PRODUCT_C( @@ -633,59 +401,6 @@ REDUCER_SUM_PRODUCT_C1( template struct SumSqrReducer; -template <> -struct SumSqrReducer { - using ctype = float; - static constexpr int SIMD_WIDTH = 4; - - float32x4_t res; - float result; - SumSqrReducer(DType, size_t cnt) : result(0.0f) { - MEGDNN_MARK_USED_VAR(cnt); - res = vdupq_n_f32(0.0f); - } - SumSqrReducer() = default; - void feed(const float* val) { - float32x4_t vval = vld1q_f32(val); - res = vaddq_f32(vmulq_f32(vval, vval), res); - } - void feed_remain(const float* val) { - float vval = *val; - result += vval * vval; - } - void post(float* dst) { -#if MEGDNN_AARCH64 - result += vaddvq_f32(res); -#elif MEGDNN_ARMV7 - auto sum_temp = vpadd_f32(vget_low_f32(res), vget_high_f32(res)); - result += (vget_lane_f32(sum_temp, 0) + vget_lane_f32(sum_temp, 1)); -#else -#error "unsupport android arch" -#endif - *dst = result; - } -}; -template <> -struct SumSqrReducer { - using ctype = float; - static constexpr int SIMD_WIDTH = 4; - - float32x4_t res; - float remain; - SumSqrReducer(DType, size_t cnt) : remain(0.0f) { - MEGDNN_MARK_USED_VAR(cnt); - res = vdupq_n_f32(0.0f); - } - SumSqrReducer() = default; - void feed(const float* val) { - float32x4_t vval = vld1q_f32(val); - res = vaddq_f32(vmulq_f32(vval, vval), res); - } - void feed_remain(const float* val) { remain += (*val) * (*val); } - void post(float* dst) { vst1q_f32(dst, res); } - void post_remain(float* dst) { *dst = remain; } -}; - #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> struct SumSqrReducer<__fp16, __fp16, __fp16, true> { @@ -873,14 +588,12 @@ void ReduceImpl::exec( default: \ break; \ } + if (src.layout.is_contiguous() && src.layout.dtype.category() == DTypeCategory::QUANTIZED && param().data_type == param::Reduce::DataType::DEFAULT) { DType src_type = src.layout.dtype; - if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { - DISPATCH_MODE_QUANTIZED(dt_qint8, int8_t, int32_t) - } if (src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm) { DISPATCH_MODE_QUANTIZED(dt_quint8, uint8_t, int32_t) } @@ -889,9 +602,7 @@ void ReduceImpl::exec( src.layout.dtype.category() == DTypeCategory::FLOAT && param().data_type == param::Reduce::DataType::DEFAULT) { DType src_type = src.layout.dtype; - if (src.layout.dtype.enumv() == DTypeEnum::Float32) { - DISPATCH_MODE_FLOAT(dt_float32, float, float) - } + MEGDNN_MARK_USED_VAR(src_type); #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC if (src.layout.dtype.enumv() == DTypeEnum::Float16) { DNN_INC_FLOAT16(DISPATCH_MODE_FLOAT(__fp16, __fp16, __fp16)); diff --git a/dnn/src/fallback/general_intrinsic/gi_common.h b/dnn/src/fallback/general_intrinsic/gi_common.h index 3b21d62d..3c5a06c4 100644 --- a/dnn/src/fallback/general_intrinsic/gi_common.h +++ b/dnn/src/fallback/general_intrinsic/gi_common.h @@ -20,13 +20,19 @@ #else #if defined(__arm__) || defined(__aarch64__) #include -#define GI_TARGET_ARM #endif #if defined(__x86_64__) || defined(__i386__) #include #include +#endif +#endif + +#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) #define GI_TARGET_X86 #endif + +#if defined(__arm__) || defined(__aarch64__) +#define GI_TARGET_ARM #endif #ifdef _WIN32 diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h index 65142d33..e306d608 100644 --- a/dnn/src/fallback/general_intrinsic/gi_float.h +++ b/dnn/src/fallback/general_intrinsic/gi_float.h @@ -454,22 +454,22 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) { GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1)); } +#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b); +#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); + GI_FORCEINLINE GI_FLOAT32 GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #if defined(GI_NEON_INTRINSICS) return vmaxq_f32(Vector1, Vector2); -#elif defined(GI_SSE2_INTRINSICS) +#else //! _mm_max_ps does not fellow the IEEE standard when input is NAN, so //! implement by C code -#define MAX_NAN(a, b) (std::isnan(a) || (a) > (b)) ? (a) : (b); GI_FLOAT32 max; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { max[i] = MAX_NAN(Vector1[i], Vector2[i]); } return max; -#else - return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2); #endif } @@ -478,18 +478,14 @@ GI_FLOAT32 GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #if defined(GI_NEON_INTRINSICS) return vminq_f32(Vector1, Vector2); -#elif defined(GI_SSE2_INTRINSICS) - return _mm_min_ps(Vector1, Vector2); +#else //! _mm_min_ps does not fellow the IEEE standard when input is NAN, so //! implement by C code -#define MIN_NAN(a, b) (std::isnan(a) || (a) < (b)) ? (a) : (b); GI_FLOAT32 min; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { min[i] = MIN_NAN(Vector1[i], Vector2[i]); } return min; -#else - return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1); #endif } @@ -563,11 +559,6 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) { VectorLow = vpmax_f32(VectorLow, VectorHigh); VectorLow = vpmax_f32(VectorLow, VectorHigh); return vget_lane_f32(VectorLow, 0); -#elif defined(GI_VSX_INTRINSICS) - Vector = GiMaximumFloat32( - Vector, GI_FLOAT32(vec_splat((__vector long long)Vector, 1))); - Vector = GiMaximumFloat32(Vector, vec_splat(Vector, 1)); - return Vector[0]; #elif defined(GI_SSE2_INTRINSICS) Vector = GiMaximumFloat32( Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3))); @@ -577,7 +568,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) { #else float ret = Vector[0]; for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { - ret = Max(ret, Vector[i]); + ret = MAX_NAN(ret, Vector[i]); } return ret; #endif @@ -602,7 +593,7 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) { #else float ret = Vector[0]; for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { - ret = Min(ret, Vector[i]); + ret = MIN_NAN(ret, Vector[i]); } return ret; #endif diff --git a/dnn/src/fallback/general_intrinsic/gi_int.h b/dnn/src/fallback/general_intrinsic/gi_int.h index aeabfa21..8749c5db 100644 --- a/dnn/src/fallback/general_intrinsic/gi_int.h +++ b/dnn/src/fallback/general_intrinsic/gi_int.h @@ -416,7 +416,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) { } GI_FORCEINLINE -int16_t GiReduceAddInt8(GI_INT8 Vector) { +int32_t GiReduceAddInt8(GI_INT8 Vector) { #if defined(GI_NEON64_INTRINSICS) return vaddlvq_s8(Vector); #elif defined(GI_NEON32_INTRINSICS) @@ -467,8 +467,10 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) { #elif defined(GI_NEON32_INTRINSICS) int8x8_t VectorLow = vget_low_s8(Vector); int8x8_t VectorHigh = vget_high_s8(Vector); - VectorLow = vpmin_s8(VectorLow, VectorHigh); - VectorLow = vpmin_s8(VectorLow, VectorHigh); + VectorLow = vpmax_s8(VectorLow, VectorHigh); + VectorLow = vpmax_s8(VectorLow, VectorLow); + VectorLow = vpmax_s8(VectorLow, VectorLow); + VectorLow = vpmax_s8(VectorLow, VectorLow); return vget_lane_s8(VectorLow, 0); #elif defined(GI_SSE42_INTRINSICS) __m128i v0 = _mm_cvtepi8_epi16(Vector); @@ -514,7 +516,9 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) { int8x8_t VectorLow = vget_low_s8(Vector); int8x8_t VectorHigh = vget_high_s8(Vector); VectorLow = vpmin_s8(VectorLow, VectorHigh); - VectorLow = vpmin_s8(VectorLow, VectorHigh); + VectorLow = vpmin_s8(VectorLow, VectorLow); + VectorLow = vpmin_s8(VectorLow, VectorLow); + VectorLow = vpmin_s8(VectorLow, VectorLow); return vget_lane_s8(VectorLow, 0); #elif defined(GI_SSE42_INTRINSICS) __m128i v0 = _mm_cvtepi8_epi16(Vector); diff --git a/dnn/src/fallback/reduce/reducer.h b/dnn/src/fallback/reduce/reducer.h index efeee039..66126b93 100644 --- a/dnn/src/fallback/reduce/reducer.h +++ b/dnn/src/fallback/reduce/reducer.h @@ -145,7 +145,7 @@ struct minReducer; _mode##Reducer() = default; \ void feed(const float* val) { \ auto vval = GiLoadFloat32(val); \ - res = Gi##_Mode##imumFloat32(vval, res); \ + res = Gi##_Mode##imumFloat32(res, vval); \ } \ void feed_remain(const float* val) { \ auto vval = GiBroadcastFloat32(*val); \ @@ -172,7 +172,7 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits::max()); _mode##Reducer() = default; \ void feed(const float* val) { \ GI_FLOAT32 vval = GiLoadFloat32(val); \ - res = Gi##_Mode##imumFloat32(vval, res); \ + res = Gi##_Mode##imumFloat32(res, vval); \ } \ void feed_remain(const float* val) { \ using namespace std; \ @@ -200,7 +200,7 @@ REDUCER_MAX_MIN_C(min, Min, std::numeric_limits::max()); } \ void feed_remain(const int8_t* val) { \ GI_INT8 vval = GiBroadcastInt8(*val); \ - res = Gi##_Mode##imumInt8(vval, res); \ + res = Gi##_Mode##imumInt8(res, vval); \ } \ void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ } @@ -223,7 +223,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127); _mode##Reducer() = default; \ void feed(const int8_t* val) { \ GI_INT8 vval = GiLoadInt8(val); \ - res = Gi##_Mode##imumInt8(vval, res); \ + res = Gi##_Mode##imumInt8(res, vval); \ } \ void feed_remain(const int8_t* val) { \ using namespace std; \