GitOrigin-RevId: f250aa7b2a
tags/v1.9.0
@@ -95,8 +95,8 @@ typedef __m128i GI_INT16;
 typedef __m128i GI_INT32;
 #else
 typedef float GI_FLOAT32 __attribute__((vector_size(16)));
-typedef uint16_t GI_UINT8 __attribute__((vector_size(16)));
-typedef int16_t GI_INT8 __attribute__((vector_size(16)));
+typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
+typedef int8_t GI_INT8 __attribute__((vector_size(16)));
 typedef int16_t GI_INT16 __attribute__((vector_size(16)));
 typedef int32_t GI_INT32 __attribute__((vector_size(16)));
 #endif
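With GCC/Clang `vector_size` extensions the lane count follows from the element type, so the old `uint16_t`/`int16_t` element types silently gave `GI_UINT8`/`GI_INT8` eight 16-bit lanes instead of sixteen 8-bit ones. A minimal standalone sketch (assuming only the compiler vector extension, not the GI headers) that pins down the difference:

    #include <cstdint>
    // 16 bytes either way, but the element type decides the lane count.
    typedef int8_t fixed_gi_int8 __attribute__((vector_size(16)));    // 16 x int8
    typedef int16_t broken_gi_int8 __attribute__((vector_size(16)));  // 8 x int16
    static_assert(sizeof(fixed_gi_int8) / sizeof(int8_t) == 16, "16 lanes after the fix");
    static_assert(sizeof(broken_gi_int8) / sizeof(int16_t) == 8, "only 8 lanes before");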
@@ -119,6 +119,9 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16)));
 #define GI_SIMD_LEN_BYTE 16
 #endif
+#define Max(a, b) (a) > (b) ? (a) : (b)
+#define Min(a, b) (a) < (b) ? (a) : (b)
 typedef struct {
     GI_INT32 val[2];
 } GI_INT32_V2;
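The `Max`/`Min` helpers (moved here from the reduce code, see the removal in the `@@ -447,16 +453,13 @@` hunk below) expand to a bare ternary with no outer parentheses, so they only compose safely when used as a full expression. A small sketch of the pitfall, using the macro exactly as defined above:

    #include <cstdio>
    #define Max(a, b) (a) > (b) ? (a) : (b)
    int main() {
        int v = Max(3, 2) + 10;  // parses as (3) > (2) ? (3) : ((2) + 10)
        std::printf("%d\n", v);  // prints 3, not the expected 13
        return 0;
    }

Every call site in this patch uses the macros as standalone expressions or as macro arguments (which are re-parenthesized on expansion), where this is harmless.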
@@ -223,7 +223,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip1q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
-    float32x2_t zipped = vzipq_f32(Vector1, Vector2);
+    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
     return zipped.val[0];
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpacklo_ps(Vector1, Vector2);
@@ -243,7 +243,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip2q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
-    float32x2_t zipped = vzipq_f32(Vector1, Vector2);
+    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
     return zipped.val[1];
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpackhi_ps(Vector1, Vector2);
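On AArch32, `vzipq_f32` returns a `float32x4x2_t` (a pair of 4-lane vectors), so the old `float32x2_t` declaration did not even compile. For reference, the interleave semantics the two functions implement, written as a plain C++ sketch (hypothetical helper names, no intrinsics):

    #include <array>
    using Vec4 = std::array<float, 4>;
    // GiInterleaveLowFloat32  == zipped.val[0] == _mm_unpacklo_ps
    Vec4 interleave_low(const Vec4& a, const Vec4& b) { return {a[0], b[0], a[1], b[1]}; }
    // GiInterleaveHighFloat32 == zipped.val[1] == _mm_unpackhi_ps
    Vec4 interleave_high(const Vec4& a, const Vec4& b) { return {a[2], b[2], a[3], b[3]}; }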
@@ -460,7 +460,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmaxq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_max_ps(Vector1, Vector2);
+    //! _mm_max_ps does not follow the IEEE standard when an input is NaN, so
+    //! implement it in plain C code
+#define MAX_NAN(a, b) (std::isnan(a) || (a) > (b)) ? (a) : (b);
+    GI_FLOAT32 max;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        max[i] = MAX_NAN(Vector1[i], Vector2[i]);
+    }
+    return max;
 #else
     return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2);
 #endif
@@ -473,6 +480,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
     return vminq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_min_ps(Vector1, Vector2);
+    //! _mm_min_ps does not follow the IEEE standard when an input is NaN, so
+    //! implement it in plain C code
+#define MIN_NAN(a, b) (std::isnan(a) || (a) < (b)) ? (a) : (b);
+    GI_FLOAT32 min;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        min[i] = MIN_NAN(Vector1[i], Vector2[i]);
+    }
+    return min;
 #else
     return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1);
 #endif
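`_mm_max_ps`/`_mm_min_ps` compute `a OP b ? a : b` per lane, and every ordered comparison involving a NaN is false, so the intrinsics return the second operand whenever either input is NaN. The scalar loops above instead let a NaN in either operand propagate to the result, matching the NEON `vmaxq_f32`/`vminq_f32` behavior the GI layer emulates. A standalone sketch of the difference:

    #include <cmath>
    #include <cstdio>
    int main() {
        float a = std::nanf(""), b = 1.0f;
        float sse_like = (a > b) ? a : b;                  // what _mm_max_ps does: 1.0
        float gi_like = (std::isnan(a) || a > b) ? a : b;  // what MAX_NAN does: nan
        std::printf("%f vs %f\n", sse_like, gi_like);
        return 0;
    }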
@@ -97,7 +97,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
 #elif defined(GI_SSE2_INTRINSICS)
     _mm_storeu_si128((__m128i*)Buffer, Vector);
 #else
-    for (int i = 0; i < 16; i++) {
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         Buffer[i] = Vector[i];
     }
 #endif
@@ -197,7 +197,8 @@ GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_andnot_si128(VectorNot, Vector);
 #else
-    return (~VectorNot) & Vector;
+    GI_INT8 Not = ~VectorNot;
+    return (Not & Vector);
 #endif
 }
@@ -327,11 +328,13 @@ GiMoveHighLongInt8(GI_INT8 Vector) {
     for (int i = 0; i < 8; i++) {
         data[i] = o_data[8 + i];
     }
-    return _mm_loadu_si16(data);
+    return _mm_loadu_si128((__m128i*)data);
 #else
     GI_INT16 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
-        ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
+    int8_t* data = (int8_t*)&Vector;
+    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
+    for (size_t i = 0; i < half_length; i++) {
+        ret[i] = data[i + half_length];
     }
     return ret;
 #endif
@@ -351,10 +354,11 @@ GiMoveLowLongInt8(GI_INT8 Vector) {
     for (int i = 0; i < 8; i++) {
         data[i] = o_data[i];
     }
-    return _mm_loadu_si16(data);
+    return _mm_loadu_si128((__m128i*)data);
 #else
     GI_INT16 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
+    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
+    for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[i];
     }
     return ret;
@@ -375,11 +379,12 @@ GiMoveHighLongInt16(GI_INT16 Vector) {
     for (int i = 0; i < 4; i++) {
         data[i] = o_data[4 + i];
     }
-    return _mm_loadu_si32(data);
+    return _mm_loadu_si128((__m128i*)data);
 #else
     GI_INT32 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) {
-        ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
+    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
+    for (size_t i = 0; i < half_length; i++) {
+        ret[i] = Vector[half_length + i];
     }
     return ret;
 #endif
@@ -399,10 +404,11 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
     for (int i = 0; i < 4; i++) {
         data[i] = o_data[i];
     }
-    return _mm_loadu_si32(data);
+    return _mm_loadu_si128((__m128i*)data);
 #else
     GI_INT32 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) {
+    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
+    for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[i];
     }
     return ret;
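The four `GiMove{High,Low}Long*` hunks above fix two separate bugs: the SSE2 paths loaded back only 2 or 4 bytes (`_mm_loadu_si16`/`_mm_loadu_si32`) of the 16-byte widened result, and the generic `Int16` paths indexed by the byte offset (8) instead of the element offset (4), reading past the vector. The intended semantics, equivalent to NEON's `vmovl_s8(vget_high_s8(v))`, as a plain sketch:

    #include <cstdint>
    // Widen the upper eight int8 lanes to int16 (sign-extending).
    void move_high_long_i8(const int8_t src[16], int16_t dst[8]) {
        for (int i = 0; i < 8; i++)
            dst[i] = src[8 + i];
    }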
@@ -414,7 +420,7 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vaddlvq_s8(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
-    int32_t sum = vpaddlq_s16(vpaddlq_s8(Vector));
+    int32x4_t sum = vpaddlq_s16(vpaddlq_s8(Vector));
     return (vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) +
             vgetq_lane_s32(sum, 3));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -431,8 +437,8 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) {
     return (int16_t)(ret);
 #elif defined(GI_SSE2_INTRINSICS)
-    __m64 low = GiGetLowInt8x16(Vector);
-    __m64 high = GiGetHighInt8x16(Vector);
+    __m64 low = _mm_movepi64_pi64(Vector);
+    __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
     __m128 v0 = _mm_cvtpi8_ps(low);
     __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
     __m128 v2 = _mm_cvtpi8_ps(high);
@@ -447,16 +453,13 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) {
     return (int16_t)(ret0 + ret1 + ret2 + ret3);
 #else
     int32_t sum = 0;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         sum += Vector[i];
     }
     return sum;
 #endif
 }
-#define Max(a, b) (a) > (b) ? (a) : (b)
-#define Min(a, b) (a) < (b) ? (a) : (b)
 GI_FORCEINLINE
 int8_t GiReduceMaxInt8(GI_INT8 Vector) {
 #if defined(GI_NEON64_INTRINSICS)
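Two fixes in the `GiReduceAddInt8` hunks: on AArch32 the widening chain `vpaddlq_s16(vpaddlq_s8(v))` yields an `int32x4_t`, not a scalar `int32_t`, and the generic loop must visit all `GI_SIMD_LEN_BYTE / sizeof(int8_t)` = 16 lanes (dividing by `sizeof(int32_t)` summed only the first four). A sketch of one `vpaddl` step, which shows why the intermediate stays a vector:

    #include <cstdint>
    // vpaddlq_s8: adjacent int8 pairs are summed into widened int16 lanes,
    // so 16 x i8 -> 8 x i16; the vpaddlq_s16 step then gives 4 x i32.
    void paddl_s8(const int8_t in[16], int16_t out[8]) {
        for (int i = 0; i < 8; i++)
            out[i] = (int16_t)in[2 * i] + (int16_t)in[2 * i + 1];
    }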
@@ -480,23 +483,23 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) {
     ret = Max(_mm_extract_epi32(sum, 3), ret);
     return (int8_t)ret;
 #elif defined(GI_SSE2_INTRINSICS)
-    __m64 low = GiGetLowInt8x16(Vector);
-    __m64 high = GiGetHighInt8x16(Vector);
+    __m64 low = _mm_movepi64_pi64(Vector);
+    __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
     __m128 v0 = _mm_cvtpi8_ps(low);
     __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
     __m128 v2 = _mm_cvtpi8_ps(high);
     __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
-    __m128 sum0 = _mm_add_ps(v0, v1);
-    __m128 sum1 = _mm_add_ps(v2, v3);
-    __m128 sum = _mm_add_ps(sum0, sum1);
-    float ret0 = _mm_cvtss_f32(sum);
-    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
-    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2)));
-    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3)));
+    __m128 max0 = _mm_max_ps(v0, v1);
+    __m128 max1 = _mm_max_ps(v2, v3);
+    __m128 max = _mm_max_ps(max0, max1);
+    float ret0 = _mm_cvtss_f32(max);
+    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 1, 1, 1)));
+    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 2, 2, 2)));
+    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 3, 3, 3)));
     return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3)));
 #else
     int8_t max = Vector[0];
-    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         max = Max(max, Vector[i]);
     }
     return max;
@@ -526,23 +529,23 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) {
     ret = Min(_mm_extract_epi32(sum, 3), ret);
     return (int8_t)ret;
 #elif defined(GI_SSE2_INTRINSICS)
-    __m64 low = GiGetLowInt8x16(Vector);
-    __m64 high = GiGetHighInt8x16(Vector);
+    __m64 low = _mm_movepi64_pi64(Vector);
+    __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
     __m128 v0 = _mm_cvtpi8_ps(low);
     __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
     __m128 v2 = _mm_cvtpi8_ps(high);
     __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
-    __m128 sum0 = _mm_add_ps(v0, v1);
-    __m128 sum1 = _mm_add_ps(v2, v3);
-    __m128 sum = _mm_add_ps(sum0, sum1);
-    float ret0 = _mm_cvtss_f32(sum);
-    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
-    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2)));
-    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3)));
+    __m128 min0 = _mm_min_ps(v0, v1);
+    __m128 min1 = _mm_min_ps(v2, v3);
+    __m128 min = _mm_min_ps(min0, min1);
+    float ret0 = _mm_cvtss_f32(min);
+    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(1, 1, 1, 1)));
+    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 2, 2, 2)));
+    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 3, 3, 3)));
     return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3)));
 #else
     int8_t min = Vector[0];
-    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         min = Min(min, Vector[i]);
     }
     return min;
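The SSE2 fallbacks of `GiReduceMaxInt8`/`GiReduceMinInt8` had been copy-pasted from `GiReduceAddInt8` and still summed the lanes; the fix swaps `_mm_add_ps` for `_mm_max_ps`/`_mm_min_ps` and folds the final four lanes with `Max`/`Min`. The generic loop bound gets the same `sizeof(int32_t)` to `sizeof(int8_t)` correction, so it now scans all 16 lanes; its reference semantics:

    #include <algorithm>
    #include <cstdint>
    // 16 == GI_SIMD_LEN_BYTE / sizeof(int8_t)
    int8_t reduce_max_i8(const int8_t v[16]) {
        int8_t m = v[0];
        for (int i = 1; i < 16; i++)
            m = std::max(m, v[i]);
        return m;
    }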
@@ -561,8 +564,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
 #if __ARM_ARCH >= 8
     int32x4_t vres0 = vcvtaq_s32_f32(src);
     int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
-    int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16)));
-    return vcombine_s16(ret, ret);
+    return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16));
 #else
     float32x4_t vzero = vdupq_n_f32(0.f);
     float32x4_t vfhalf = vdupq_n_f32(0.5f);
@@ -570,8 +572,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
     float32x4_t vinc0 = vbslq_f32(vcgeq_f32(src, vzero), vfhalf, vfneg_half);
     int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0));
     int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
-    int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16)));
-    return vcombine_s16(ret, ret);
+    return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16));
 #endif
 #elif defined(GI_SSE42_INTRINSICS)
     __m128 vfzero = _mm_set1_ps(0.f);
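The old epilogue fed an `int16x8_t` to `vqmovn_s32` (a type error) and returned through `vcombine_s16` where an `int8x16_t` is expected; the rewrite narrows once with `vqmovn_s16` and combines with `vcombine_s8`. The overall conversion is float, then round-to-nearest (ties away from zero), then saturating narrow to i16, then to i8; a scalar sketch of that chain:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    int8_t cvt_f32_to_i8(float x) {
        long r = std::lround(x);                     // rounds ties away from zero
        r = std::min(32767L, std::max(-32768L, r));  // vqmovn_s32 saturation
        r = std::min(127L, std::max(-128L, r));      // vqmovn_s16 saturation
        return (int8_t)r;
    }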
@@ -0,0 +1,81 @@
+/**
+ * \file dnn/src/fallback/quantized_converter.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#pragma once
+
+#include "megdnn/dtype.h"
+#include "megdnn/oprs.h"
+#include "src/common/utils.h"
+#include "src/fallback/general_intrinsic/gi_float.h"
+#include "src/fallback/general_intrinsic/gi_int.h"
+
+namespace megdnn {
+namespace fallback {
+
+struct QConverterBase {
+    inline static GI_INT32 vzero() { return GiBroadcastInt32(0); }
+    inline static GI_FLOAT32 vfzero() { return GiBroadcastFloat32(0.f); }
+    inline static GI_FLOAT32 vfhalf() { return GiBroadcastFloat32(0.5f); }
+    inline static GI_FLOAT32 vfneg_half() { return GiBroadcastFloat32(-0.5f); }
+};
+
+struct QConverter {
+    template <typename dst_type, typename... src_type>
+    static inline dst_type convert(const src_type&... src);
+
+    template <typename dst_type, typename... src_type>
+    static inline dst_type round(const src_type&... src);
+};
+
+template <>
+inline dt_qint8 QConverter::convert(const float& src) {
+    return dt_qint8(saturate<int8_t, float>(std::round(src), -128, 127));
+}
+
+template <>
+inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) {
+    return dt_quint8(saturate<uint8_t, float>(std::round(src) + zp, 0, 255));
+}
+
+template <>
+inline dt_qint32 QConverter::convert(const float& src) {
+    return dt_qint32(saturate<int32_t, float>(
+            std::round(src), static_cast<float>(std::numeric_limits<int32_t>::min()),
+            static_cast<float>(std::numeric_limits<int32_t>::max())));
+}
+
+template <>
+inline GI_FLOAT32_V2 QConverter::convert(const GI_INT16& vsrc) {
+    GI_INT32 vhi = GiMoveHighLongInt16(vsrc);
+    GI_INT32 vlo = GiMoveLowLongInt16(vsrc);
+    return {{GiCastToFloat32(vlo), GiCastToFloat32(vhi)}};
+}
+
+template <>
+inline GI_INT8 QConverter::convert(const GI_FLOAT32_V2& vsrc) {
+    return GiCvtFromFloat32V2ToInt8(vsrc);
+}
+
+template <>
+inline GI_INT8 QConverter::convert(const GI_FLOAT32& src) {
+    return GiCvtFromFloat32ToInt8(src);
+}
+
+template <>
+inline GI_INT32 QConverter::round(const GI_FLOAT32& vsrc) {
+    return GiRoundAsInt32(vsrc);
+}
+
+} // namespace fallback
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
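`QConverter` is a thin tag-dispatch wrapper: the destination type is chosen via the explicit template argument and the sources are deduced. A hypothetical call-site sketch (the values and variable names are invented for illustration; requires the megdnn fallback headers):

    #include "src/fallback/quantized_converter.h"
    using namespace megdnn;
    using namespace fallback;
    // Scalar: round to nearest, then saturate into the quantized range.
    dt_qint8 q = QConverter::convert<dt_qint8>(300.2f);               // -> 127
    dt_quint8 u = QConverter::convert<dt_quint8>(-3.7f, uint8_t(3));  // zp 3 -> 0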
@@ -14,11 +14,13 @@
 #include "src/naive/handle.h"
 #include "midout.h"
+#include "reducer.h"
 #include "src/common/reduce_helper.h"
 MIDOUT_DECL(megdnn_fb_reduce_op)
 MIDOUT_DECL(megdnn_fb_reduce_c)
 MIDOUT_DECL(megdnn_fb_reduce_dtype)
+MIDOUT_DECL(megdnn_fallback_reduce_optimized)
 namespace {
@@ -77,11 +79,20 @@ namespace fallback {
 void ReduceImpl::exec(
         _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
     check_exec(src.layout, dst.layout, workspace.size);
+    if (!exec_optimized(src, dst, workspace)) {
+        return exec_fallback(src, dst, workspace);
+    }
+}
+
+void ReduceImpl::exec_fallback(
+        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
     using namespace reduce;
     using Mode = Param::Mode;
+    check_exec(src.layout, dst.layout, workspace.size);
     size_t A, B, C;
     get_ABC(src.layout, A, B, C, param().axis);
 #define cb_by_op(src_type, dst_type, _wtype, mode_, Op_, kern_func) \
     if (param().mode == mode_) { \
         typedef DTypeTrait<src_type>::ctype src_ctype; \
@@ -176,6 +187,101 @@ void ReduceImpl::exec(
     naive::ReduceForwardImpl::exec(src, dst, workspace);
 }
+
+bool ReduceImpl::exec_optimized(
+        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) {
+    size_t A, B, C;
+    reduce::get_ABC(src.layout, A, B, C, param().axis);
+    bool execed = false;
+    using Mode = param::Reduce::Mode;
+#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
+    if (C == 1) { \
+        using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
+        std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
+                do_reduce = Exec<_Reducer, true>::do_reduce; \
+        MIDOUT_BEGIN( \
+                megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
+                midout_iv(0)) { \
+            MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
+                    reinterpret_cast<ctype*>(src.raw_ptr()), \
+                    reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
+            execed = true; \
+        } \
+        MIDOUT_END(); \
+    } else { \
+        using _Reducer = Reducer<dtype, ctype, comp_type, false>; \
+        std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
+                do_reduce = Exec<_Reducer, false>::do_reduce; \
+        MIDOUT_BEGIN( \
+                megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
+                midout_iv(1)) { \
+            MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
+                    reinterpret_cast<ctype*>(src.raw_ptr()), \
+                    reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
+            execed = true; \
+        } \
+        MIDOUT_END(); \
+    }
+#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \
+    switch (param().mode) { \
+        case Mode::MEAN: \
+            DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \
+            break; \
+        case Mode::MAX: \
+            DISPATCH_FUNC(maxReducer, dtype, ctype, ctype); \
+            break; \
+        case Mode::MIN: \
+            DISPATCH_FUNC(minReducer, dtype, ctype, ctype); \
+            break; \
+        default: \
+            break; \
+    }
+#define DISPATCH_MODE_FLOAT(dtype, ctype, comp_type) \
+    switch (param().mode) { \
+        case Mode::MEAN: \
+            DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \
+            break; \
+        case Mode::MAX: \
+            DISPATCH_FUNC(maxReducer, dtype, ctype, ctype); \
+            break; \
+        case Mode::MIN: \
+            DISPATCH_FUNC(minReducer, dtype, ctype, ctype); \
+            break; \
+        case Mode::SUM: \
+            DISPATCH_FUNC(SumReducer, dtype, ctype, ctype); \
+            break; \
+        case Mode::SUM_SQR: \
+            DISPATCH_FUNC(SumSqrReducer, dtype, ctype, ctype); \
+            break; \
+        case Mode::PRODUCT: \
+            DISPATCH_FUNC(ProductReducer, dtype, ctype, ctype); \
+            break; \
+        default: \
+            break; \
+    }
+
+    if (src.layout.is_contiguous() &&
+        src.layout.dtype.category() == DTypeCategory::QUANTIZED &&
+        param().data_type == param::Reduce::DataType::DEFAULT) {
+        DType src_type = src.layout.dtype;
+        if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
+            DISPATCH_MODE_QUANTIZED(dt_qint8, int8_t, int32_t)
+        }
+    } else if (
+            src.layout.is_contiguous() &&
+            src.layout.dtype.category() == DTypeCategory::FLOAT &&
+            param().data_type == param::Reduce::DataType::DEFAULT) {
+        DType src_type = src.layout.dtype;
+        if (src.layout.dtype.enumv() == DTypeEnum::Float32) {
+            DISPATCH_MODE_FLOAT(dt_float32, float, float)
+        }
+    }
+    return execed;
+#undef DISPATCH_FUNC
+#undef DISPATCH_MODE_QUANTIZED
+#undef DISPATCH_MODE_FLOAT
+}
+
 } // namespace fallback
 } // namespace megdnn
 // vim: syntax=cpp.doxygen
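The `exec`/`exec_optimized`/`exec_fallback` split is the usual try-fast-path-first pattern: `exec_optimized` returns true only when a contiguous QuantizedS8 or Float32 layout with `DataType::DEFAULT` matches one of the dispatched reducers; otherwise `exec` falls through to the original generic path (which itself still ends in `naive::ReduceForwardImpl::exec`). Schematically, with free functions standing in for the `ReduceImpl` members:

    // Control-flow sketch of the dispatch (simplified).
    bool exec_optimized();  // false: no fast path matched
    void exec_fallback();   // the pre-existing generic kernel
    void exec() {
        if (!exec_optimized())
            exec_fallback();
    }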
@@ -19,6 +19,10 @@ public:
     using ReduceForwardImpl::ReduceForwardImpl;
     void exec(
             _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) override;
+    bool exec_optimized(
+            _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
+    void exec_fallback(
+            _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
 };
 } // namespace fallback
@@ -0,0 +1,417 @@
+/**
+ * \file dnn/src/fallback/reduce/reducer.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "src/common/utils.h"
+#include "src/fallback/general_intrinsic/gi_float.h"
+#include "src/fallback/general_intrinsic/gi_int.h"
+#include "src/fallback/quantized_converter.h"
+
+using namespace megdnn;
+using namespace fallback;
+
+namespace {
+
+/*****************************Mean Reducer***********************/
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct MeanReducer;
+
+template <>
+struct MeanReducer<dt_qint8, int8_t, int32_t, true> {
+    using ctype = int8_t;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
+
+    int32_t res;
+    float coef;
+    MeanReducer(DType, size_t cnt) : res(0), coef(1.0 / cnt) {}
+    MeanReducer() = default;
+    void feed(const int8_t* val) { res += GiReduceAddInt8(GiLoadInt8(val)); }
+    void feed_remain(const int8_t* val) { res += *val; }
+    void post(int8_t* dst) {
+        float sum = res * coef;
+        *dst = std::round(sum);
+    }
+};
+
+template <>
+struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
+    using ctype = int8_t;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
+
+    GI_INT32 res[4];
+    int32_t remain;
+    int32_t cnt;
+    float coef;
+    GI_FLOAT32 vcoef;
+    MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
+        memset(res, 0, sizeof(res));
+        vcoef = GiBroadcastFloat32(coef);
+    }
+    MeanReducer() = default;
+    void feed(const int8_t* val) {
+        const GI_INT8 vval = GiLoadInt8(val);
+        const GI_INT16 vval_low = GiMoveLowLongInt8(vval);
+        const GI_INT16 vval_high = GiMoveHighLongInt8(vval);
+
+        const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low);
+        const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low);
+        const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high);
+        const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high);
+
+        res[0] = GiAddInt32(res[0], vval_low_low);
+        res[1] = GiAddInt32(res[1], vval_low_high);
+        res[2] = GiAddInt32(res[2], vval_high_low);
+        res[3] = GiAddInt32(res[3], vval_high_high);
+    }
+    void feed_remain(const int8_t* val) { remain += *val; }
+    void post(int8_t* dst) {
+        for (int i = 0; i < 4; i += 2) {
+            GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
+            GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
+            GiStoreLowInt8(
+                    dst,
+                    (QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}})));
+            dst += 8;
+        }
+    }
+    void post_remain(int8_t* dst) {
+        float sum = remain * coef;
+        *dst = std::round(sum);
+    }
+};
+
+template <>
+struct MeanReducer<dt_float32, float, float, true> {
+    using ctype = float;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
+
+    GI_FLOAT32 res;
+    float result;
+    float coef;
+    MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
+        res = GiBroadcastFloat32(0.0f);
+    }
+    MeanReducer() = default;
+    void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); }
+    void feed_remain(const float* val) { result += *val; }
+    void post(float* dst) {
+        result += GiReduceAddFloat32(res);
+        *dst = result * coef;
+    }
+};
+
+template <>
+struct MeanReducer<dt_float32, float, float, false> {
+    using ctype = float;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
+
+    GI_FLOAT32 res;
+    float remain;
+    float coef;
+    MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
+        res = GiBroadcastFloat32(0.0f);
+    }
+    MeanReducer() = default;
+    void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); }
+    void feed_remain(const float* val) { remain += *val; }
+    void post(float* dst) {
+        res = GiMultiplyScalerFloat32(res, coef);
+        GiStoreFloat32(dst, res);
+    }
+    void post_remain(float* dst) { *dst = remain * coef; }
+};
+
+/******************************max min Reducer****************************/
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct maxReducer;
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct minReducer;
+
+#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
+    template <> \
+    struct _mode##Reducer<dt_float32, float, float, true> { \
+        using ctype = float; \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
+        GI_FLOAT32 res; \
+        _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \
+        _mode##Reducer() = default; \
+        void feed(const float* val) { \
+            auto vval = GiLoadFloat32(val); \
+            res = Gi##_Mode##imumFloat32(vval, res); \
+        } \
+        void feed_remain(const float* val) { \
+            auto vval = GiBroadcastFloat32(*val); \
+            res = Gi##_Mode##imumFloat32(vval, res); \
+        } \
+        void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \
+    }
+
+REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest());
+REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
+#undef REDUCER_MAX_MIN_C1
+
+#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
+    template <> \
+    struct _mode##Reducer<dt_float32, float, float, false> { \
+        using ctype = float; \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
+        GI_FLOAT32 res; \
+        float remain; \
+        _mode##Reducer(DType, size_t) { \
+            res = GiBroadcastFloat32(_init); \
+            remain = _init; \
+        } \
+        _mode##Reducer() = default; \
+        void feed(const float* val) { \
+            GI_FLOAT32 vval = GiLoadFloat32(val); \
+            res = Gi##_Mode##imumFloat32(vval, res); \
+        } \
+        void feed_remain(const float* val) { \
+            using namespace std; \
+            remain = _mode(*val, remain); \
+        } \
+        void post(float* dst) { GiStoreFloat32(dst, res); } \
+        void post_remain(float* dst) { *dst = remain; } \
+    }
+
+REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
+REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
+#undef REDUCER_MAX_MIN_C
+
+#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
+    template <> \
+    struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \
+        using ctype = int8_t; \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
+        GI_INT8 res; \
+        _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \
+        _mode##Reducer() = default; \
+        void feed(const int8_t* val) { \
+            GI_INT8 vval = GiLoadInt8(val); \
+            res = Gi##_Mode##imumInt8(vval, res); \
+        } \
+        void feed_remain(const int8_t* val) { \
+            GI_INT8 vval = GiBroadcastInt8(*val); \
+            res = Gi##_Mode##imumInt8(vval, res); \
+        } \
+        void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \
+    }
+
+REDUCER_MAX_MIN_C1(max, Max, -128);
+REDUCER_MAX_MIN_C1(min, Min, 127);
+#undef REDUCER_MAX_MIN_C1
+
+#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
+    template <> \
+    struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \
+        using ctype = int8_t; \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
+        GI_INT8 res; \
+        int8_t remain; \
+        _mode##Reducer(DType, size_t) { \
+            res = GiBroadcastInt8(_init); \
+            remain = _init; \
+        } \
+        _mode##Reducer() = default; \
+        void feed(const int8_t* val) { \
+            GI_INT8 vval = GiLoadInt8(val); \
+            res = Gi##_Mode##imumInt8(vval, res); \
+        } \
+        void feed_remain(const int8_t* val) { \
+            using namespace std; \
+            remain = _mode(*val, remain); \
+        } \
+        void post(int8_t* dst) { GiStoreInt8(dst, res); } \
+        void post_remain(int8_t* dst) { *dst = remain; } \
+    }
+
+REDUCER_MAX_MIN_C(max, Max, -128);
+REDUCER_MAX_MIN_C(min, Min, 127);
+#undef REDUCER_MAX_MIN_C
+
+/***************************Sum Product Reducer***************************/
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct SumReducer;
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct ProductReducer;
+
+#define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \
+    template <> \
+    struct _mode##Reducer<dt_float32, float, float, true> { \
+        using ctype = float; \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
+        GI_FLOAT32 res; \
+        float remain; \
+        _mode##Reducer(DType, size_t) { \
+            res = GiBroadcastFloat32(_init); \
+            remain = _init; \
+        } \
+        _mode##Reducer() = default; \
+        void feed(const float* val) { \
+            GI_FLOAT32 vval = GiLoadFloat32(val); \
+            res = Gi##_Mode##Float32(vval, res); \
+        } \
+        void feed_remain(const float* val) { \
+            using namespace std; \
+            auto op = _op<float>(); \
+            remain = op(remain, *val); \
+        } \
+        void post(float* dst) { \
+            using namespace std; \
+            auto op = _op<float>(); \
+            *dst = op(remain, GiReduce##_Mode##Float32(res)); \
+        } \
+    }
+
+REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f);
+REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
+#undef REDUCER_SUM_PRODUCT_C1
+
+#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \
+    template <> \
+    struct _mode##Reducer<dt_float32, float, float, false> { \
+        using ctype = float; \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
+        GI_FLOAT32 res; \
+        float remain; \
+        _mode##Reducer(DType, size_t) { \
+            res = GiBroadcastFloat32(_init); \
+            remain = _init; \
+        } \
+        _mode##Reducer() = default; \
+        void feed(const float* val) { \
+            GI_FLOAT32 vval = GiLoadFloat32(val); \
+            res = Gi##_Mode##Float32(vval, res); \
+        } \
+        void feed_remain(const float* val) { \
+            using namespace std; \
+            auto op = _op<float>(); \
+            remain = op(remain, (*val)); \
+        } \
+        void post(float* dst) { GiStoreFloat32(dst, res); } \
+        void post_remain(float* dst) { *dst = remain; } \
+    }
+
+REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f);
+REDUCER_SUM_PRODUCT_C(Product, Multiply, multiplies, 1.0f);
+#undef REDUCER_SUM_PRODUCT_C
+
+/***************************SumSqr Reducer***************************/
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct SumSqrReducer;
+
+template <>
+struct SumSqrReducer<dt_float32, float, float, true> {
+    using ctype = float;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
+
+    GI_FLOAT32 res;
+    float result;
+    SumSqrReducer(DType, size_t cnt) : result(0.0f) {
+        MEGDNN_MARK_USED_VAR(cnt);
+        res = GiBroadcastFloat32(0.0f);
+    }
+    SumSqrReducer() = default;
+    void feed(const float* val) {
+        GI_FLOAT32 vval = GiLoadFloat32(val);
+        res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
+    }
+    void feed_remain(const float* val) {
+        float vval = *val;
+        result += vval * vval;
+    }
+    void post(float* dst) {
+        result += GiReduceAddFloat32(res);
+        *dst = result;
+    }
+};
+
+template <>
+struct SumSqrReducer<dt_float32, float, float, false> {
+    using ctype = float;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
+
+    GI_FLOAT32 res;
+    float remain;
+    SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
+        MEGDNN_MARK_USED_VAR(cnt);
+        res = GiBroadcastFloat32(0.0f);
+    }
+    SumSqrReducer() = default;
+    void feed(const float* val) {
+        GI_FLOAT32 vval = GiLoadFloat32(val);
+        res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
+    }
+    void feed_remain(const float* val) { remain += (*val) * (*val); }
+    void post(float* dst) { GiStoreFloat32(dst, res); }
+    void post_remain(float* dst) { *dst = remain; }
+};
+
+/**************************************do reduce*************************/
+template <typename Reducer, bool C1>
+struct Exec {
+    static void do_reduce(
+            const typename Reducer::ctype* src, typename Reducer::ctype* dst,
+            DType src_dtype, size_t A, size_t B, size_t C);
+};
+
+template <typename Reducer>
+struct Exec<Reducer, true> {
+    static void do_reduce(
+            const typename Reducer::ctype* src, typename Reducer::ctype* dst,
+            DType src_dtype, size_t A, size_t B, size_t) {
+        size_t a = 0;
+        for (; a < A; a++) {
+            Reducer reducer0(src_dtype, B);
+            auto temp_src0 = src + a * B;
+            size_t b = 0;
+            for (; b + Reducer::SIMD_WIDTH <= B; b += Reducer::SIMD_WIDTH) {
+                reducer0.feed(temp_src0);
+                temp_src0 += Reducer::SIMD_WIDTH;
+            }
+            for (; b < B; b++) {
+                reducer0.feed_remain(temp_src0);
+                temp_src0++;
+            }
+            reducer0.post(dst);
+            dst++;
+        }
+    }
+};
+
+template <typename Reducer>
+struct Exec<Reducer, false> {
+    static void do_reduce(
+            const typename Reducer::ctype* src, typename Reducer::ctype* dst,
+            DType src_dtype, size_t A, size_t B, size_t C) {
+        for (size_t a = 0; a < A; a++) {
+            size_t c = 0;
+            for (; c + Reducer::SIMD_WIDTH <= C; c += Reducer::SIMD_WIDTH) {
+                Reducer reducer(src_dtype, B);
+                for (size_t b = 0; b < B; b++)
+                    reducer.feed(src + c + C * b);
+                reducer.post(dst);
+                dst += Reducer::SIMD_WIDTH;
+            }
+            for (; c < C; c++) {
+                Reducer reducer(src_dtype, B);
+                for (size_t b = 0; b < B; b++)
+                    reducer.feed_remain(src + c + C * b);
+                reducer.post_remain(dst);
+                dst++;
+            }
+            src += B * C;
+        }
+    }
+};
+
+} // namespace
+
+// vim: syntax=cpp.doxygen
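Each reducer exposes the same small protocol (`feed`, `feed_remain`, `post`, `post_remain`, `SIMD_WIDTH`), and `Exec` only decides how to walk the (A, B, C) space: the `C == 1` specialization vectorizes along B with a scalar tail, the general one vectorizes along C. A compilable sketch of the `C == 1` loop shape over a hypothetical reducer type satisfying that protocol:

    #include <cstddef>
    template <typename Reducer>  // Reducer: hypothetical, mirrors the structs above
    void reduce_rows(const float* src, float* dst, size_t A, size_t B) {
        for (size_t a = 0; a < A; a++, src += B) {
            Reducer r(B);
            size_t b = 0;
            for (; b + Reducer::SIMD_WIDTH <= B; b += Reducer::SIMD_WIDTH)
                r.feed(src + b);         // vector body
            for (; b < B; b++)
                r.feed_remain(src + b);  // scalar tail
            r.post(dst++);               // one output per row when C == 1
        }
    }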
@@ -181,7 +181,6 @@ TEST_F(ARM_COMMON, LSTM_FORWARD_RECORD) {
 TEST_F(ARM_COMMON, BENCHMARK_LSTM_FORWARD) {
     Benchmarker<LSTM> optimized_bench(handle());
     constexpr size_t RUNS = 20;
-
     auto run = [&](size_t hidden_size, size_t input_size) {
         optimized_bench.set_times(20).set_display(true);
         size_t gate_hidden_size = 4 * hidden_size;
@@ -18,6 +18,75 @@
 using namespace megdnn;
 using namespace test;
+
+TEST_F(FALLBACK, REDUCE_FULL) {
+    using Param = Reduce::Param;
+    using Mode = Param::Mode;
+    Checker<Reduce> checker(handle());
+    UniformIntRNG rng{INT8_MIN >> 1, INT8_MAX >> 1};
+    checker.set_rng(0, &rng);
+    struct Config {
+        Param param;
+        DType dtype;
+        TensorShape shape;
+        Config(Param param, DType dtype, TensorShape shape)
+                : param(param), dtype(dtype), shape(shape) {}
+    };
+    std::vector<Config> configs;
+    for (auto mode : {Mode::MEAN, Mode::MAX, Mode::MIN})
+        for (auto dtype : std::vector<DType>{
+                     dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f),
+                     dtype::Quantized8Asymm(1.3f, static_cast<uint8_t>(3))})
+            for (int32_t axis : {0, 1, 2}) {
+                for (size_t A : {1, 3, 5}) {
+                    for (size_t B : {4, 6, 9, 16, 33, 45}) {
+                        for (size_t C : {4, 6, 9, 16, 33, 45}) {
+                            TensorShape shape{A, B, C};
+                            Param param(mode, axis);
+                            Config config(param, dtype, shape);
+                            configs.push_back(config);
+                        }
+                    }
+                }
+            }
+
+    for (auto&& config : configs) {
+        auto&& dtype = config.dtype;
+        auto&& param = config.param;
+        auto&& shape = config.shape;
+        checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
+    }
+    configs.clear();
+    for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR})
+        for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()})
+            for (int32_t axis : {0, 1, 2}) {
+                for (size_t A : {1, 3, 5}) {
+                    for (size_t B : {4, 6, 9, 16, 33, 45}) {
+                        for (size_t C : {4, 6, 9, 16, 33, 45}) {
+                            TensorShape shape{A, B, C};
+                            Param param(mode, axis);
+                            Config config(param, dtype, shape);
+                            configs.push_back(config);
+                        }
+                    }
+                }
+            }
+
+    UniformFloatRNG rng_float(-2, 2);
+    checker.set_rng(0, &rng_float);
+    checker.set_epsilon(1e-1);
+    for (auto&& config : configs) {
+        auto&& dtype = config.dtype;
+        auto&& param = config.param;
+        auto&& shape = config.shape;
+        if (dtype == dtype::Float16())
+            checker.set_epsilon(1e-1);
+        else
+            checker.set_epsilon(1e-3);
+        checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
+    }
+}
+
 TEST_F(FALLBACK, REDUCE) {
     using Param = Reduce::Param;
     using Mode = Param::Mode;