From e34a642b31311a64961c5927657025d2bb0279fa Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 17 Feb 2022 18:27:26 +0800 Subject: [PATCH] feat(fallback): reduce support general intrinsic GitOrigin-RevId: f250aa7b2a145a66699636c11e5602f02693ed2a --- dnn/src/fallback/general_intrinsic/gi_common.h | 7 +- dnn/src/fallback/general_intrinsic/gi_float.h | 21 +- dnn/src/fallback/general_intrinsic/gi_int.h | 87 +++--- dnn/src/fallback/quantized_converter.h | 81 +++++ dnn/src/fallback/reduce/opr_impl.cpp | 106 +++++++ dnn/src/fallback/reduce/opr_impl.h | 4 + dnn/src/fallback/reduce/reducer.h | 417 +++++++++++++++++++++++++ dnn/test/arm_common/lstm.cpp | 1 - dnn/test/fallback/reduce.cpp | 69 ++++ 9 files changed, 744 insertions(+), 49 deletions(-) create mode 100644 dnn/src/fallback/quantized_converter.h create mode 100644 dnn/src/fallback/reduce/reducer.h diff --git a/dnn/src/fallback/general_intrinsic/gi_common.h b/dnn/src/fallback/general_intrinsic/gi_common.h index 3050f424..3b21d62d 100644 --- a/dnn/src/fallback/general_intrinsic/gi_common.h +++ b/dnn/src/fallback/general_intrinsic/gi_common.h @@ -95,8 +95,8 @@ typedef __m128i GI_INT16; typedef __m128i GI_INT32; #else typedef float GI_FLOAT32 __attribute__((vector_size(16))); -typedef uint16_t GI_UINT8 __attribute__((vector_size(16))); -typedef int16_t GI_INT8 __attribute__((vector_size(16))); +typedef uint8_t GI_UINT8 __attribute__((vector_size(16))); +typedef int8_t GI_INT8 __attribute__((vector_size(16))); typedef int16_t GI_INT16 __attribute__((vector_size(16))); typedef int32_t GI_INT32 __attribute__((vector_size(16))); #endif @@ -119,6 +119,9 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16))); #define GI_SIMD_LEN_BYTE 16 #endif +#define Max(a, b) (a) > (b) ? (a) : (b) +#define Min(a, b) (a) < (b) ? (a) : (b) + typedef struct { GI_INT32 val[2]; } GI_INT32_V2; diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h index 79b1e778..65142d33 100644 --- a/dnn/src/fallback/general_intrinsic/gi_float.h +++ b/dnn/src/fallback/general_intrinsic/gi_float.h @@ -223,7 +223,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #if defined(GI_NEON64_INTRINSICS) return vzip1q_f32(Vector1, Vector2); #elif defined(GI_NEON32_INTRINSICS) - float32x2_t zipped = vzipq_f32(Vector1, Vector2); + float32x4x2_t zipped = vzipq_f32(Vector1, Vector2); return zipped.val[0]; #elif defined(GI_SSE2_INTRINSICS) return _mm_unpacklo_ps(Vector1, Vector2); @@ -243,7 +243,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #if defined(GI_NEON64_INTRINSICS) return vzip2q_f32(Vector1, Vector2); #elif defined(GI_NEON32_INTRINSICS) - float32x2_t zipped = vzipq_f32(Vector1, Vector2); + float32x4x2_t zipped = vzipq_f32(Vector1, Vector2); return zipped.val[1]; #elif defined(GI_SSE2_INTRINSICS) return _mm_unpackhi_ps(Vector1, Vector2); @@ -460,7 +460,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #if defined(GI_NEON_INTRINSICS) return vmaxq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) - return _mm_max_ps(Vector1, Vector2); + //! _mm_max_ps does not fellow the IEEE standard when input is NAN, so + //! implement by C code +#define MAX_NAN(a, b) (std::isnan(a) || (a) > (b)) ? 
(a) : (b); + GI_FLOAT32 max; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + max[i] = MAX_NAN(Vector1[i], Vector2[i]); + } + return max; #else return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2); #endif @@ -473,6 +480,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { return vminq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_min_ps(Vector1, Vector2); + //! _mm_min_ps does not fellow the IEEE standard when input is NAN, so + //! implement by C code +#define MIN_NAN(a, b) (std::isnan(a) || (a) < (b)) ? (a) : (b); + GI_FLOAT32 min; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + min[i] = MIN_NAN(Vector1[i], Vector2[i]); + } + return min; #else return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1); #endif diff --git a/dnn/src/fallback/general_intrinsic/gi_int.h b/dnn/src/fallback/general_intrinsic/gi_int.h index 38dfafd3..aeabfa21 100644 --- a/dnn/src/fallback/general_intrinsic/gi_int.h +++ b/dnn/src/fallback/general_intrinsic/gi_int.h @@ -97,7 +97,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) { #elif defined(GI_SSE2_INTRINSICS) _mm_storeu_si128((__m128i*)Buffer, Vector); #else - for (int i = 0; i < 16; i++) { + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { Buffer[i] = Vector[i]; } #endif @@ -197,7 +197,8 @@ GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) { #elif defined(GI_SSE2_INTRINSICS) return _mm_andnot_si128(VectorNot, Vector); #else - return (~VectorNot) & Vector; + GI_INT8 Not = ~VectorNot; + return (Not & Vector); #endif } @@ -327,11 +328,13 @@ GiMoveHighLongInt8(GI_INT8 Vector) { for (int i = 0; i < 8; i++) { data[i] = o_data[8 + i]; } - return _mm_loadu_si16(data); + return _mm_loadu_si128((__m128i*)data); #else GI_INT16 ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { - ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i]; + int8_t* data = (int8_t*)&Vector; + size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); + for (size_t i = 0; i < half_length; i++) { + ret[i] = data[i + half_length]; } return ret; #endif @@ -351,10 +354,11 @@ GiMoveLowLongInt8(GI_INT8 Vector) { for (int i = 0; i < 8; i++) { data[i] = o_data[i]; } - return _mm_loadu_si16(data); + return _mm_loadu_si128((__m128i*)data); #else GI_INT16 ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { + size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); + for (size_t i = 0; i < half_length; i++) { ret[i] = Vector[i]; } return ret; @@ -375,11 +379,12 @@ GiMoveHighLongInt16(GI_INT16 Vector) { for (int i = 0; i < 4; i++) { data[i] = o_data[4 + i]; } - return _mm_loadu_si32(data); + return _mm_loadu_si128((__m128i*)data); #else GI_INT32 ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) { - ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i]; + size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); + for (size_t i = 0; i < half_length; i++) { + ret[i] = Vector[half_length + i]; } return ret; #endif @@ -399,10 +404,11 @@ GiMoveLowLongInt16(GI_INT16 Vector) { for (int i = 0; i < 4; i++) { data[i] = o_data[i]; } - return _mm_loadu_si32(data); + return _mm_loadu_si128((__m128i*)data); #else GI_INT32 ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) { + size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); + for (size_t i = 0; i < half_length; i++) { ret[i] = Vector[i]; } return ret; @@ -414,7 +420,7 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) { #if defined(GI_NEON64_INTRINSICS) return vaddlvq_s8(Vector); #elif 
defined(GI_NEON32_INTRINSICS) - int32_t sum = vpaddlq_s16(vpaddlq_s8(Vector)); + int32x4_t sum = vpaddlq_s16(vpaddlq_s8(Vector)); return (vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) + vgetq_lane_s32(sum, 3)); #elif defined(GI_SSE42_INTRINSICS) @@ -431,8 +437,8 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) { return (int16_t)(ret); #elif defined(GI_SSE2_INTRINSICS) - __m64 low = GiGetLowInt8x16(Vector); - __m64 high = GiGetHighInt8x16(Vector); + __m64 low = _mm_movepi64_pi64(Vector); + __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector)); __m128 v0 = _mm_cvtpi8_ps(low); __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low)); __m128 v2 = _mm_cvtpi8_ps(high); @@ -447,16 +453,13 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) { return (int16_t)(ret0 + ret1 + ret2 + ret3); #else int32_t sum = 0; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { sum += Vector[i]; } return sum; #endif } -#define Max(a, b) (a) > (b) ? (a) : (b) -#define Min(a, b) (a) < (b) ? (a) : (b) - GI_FORCEINLINE int8_t GiReduceMaxInt8(GI_INT8 Vector) { #if defined(GI_NEON64_INTRINSICS) @@ -480,23 +483,23 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) { ret = Max(_mm_extract_epi32(sum, 3), ret); return (int8_t)ret; #elif defined(GI_SSE2_INTRINSICS) - __m64 low = GiGetLowInt8x16(Vector); - __m64 high = GiGetHighInt8x16(Vector); + __m64 low = _mm_movepi64_pi64(Vector); + __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector)); __m128 v0 = _mm_cvtpi8_ps(low); __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low)); __m128 v2 = _mm_cvtpi8_ps(high); __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high)); - __m128 sum0 = _mm_add_ps(v0, v1); - __m128 sum1 = _mm_add_ps(v2, v3); - __m128 sum = _mm_add_ps(sum0, sum1); - float ret0 = _mm_cvtss_f32(sum); - float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); - float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2))); - float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3))); + __m128 max0 = _mm_max_ps(v0, v1); + __m128 max1 = _mm_max_ps(v2, v3); + __m128 max = _mm_max_ps(max0, max1); + float ret0 = _mm_cvtss_f32(max); + float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 1, 1, 1))); + float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 2, 2, 2))); + float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 3, 3, 3))); return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3))); #else int8_t max = Vector[0]; - for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { max = Max(max, Vector[i]); } return max; @@ -526,23 +529,23 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) { ret = Min(_mm_extract_epi32(sum, 3), ret); return (int8_t)ret; #elif defined(GI_SSE2_INTRINSICS) - __m64 low = GiGetLowInt8x16(Vector); - __m64 high = GiGetHighInt8x16(Vector); + __m64 low = _mm_movepi64_pi64(Vector); + __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector)); __m128 v0 = _mm_cvtpi8_ps(low); __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low)); __m128 v2 = _mm_cvtpi8_ps(high); __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high)); - __m128 sum0 = _mm_add_ps(v0, v1); - __m128 sum1 = _mm_add_ps(v2, v3); - __m128 sum = _mm_add_ps(sum0, sum1); - float ret0 = _mm_cvtss_f32(sum); - float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); - float ret2 = 
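// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): the SSE2 hunks
// above replace GiGetLow/HighInt8x16 with _mm_movepi64_pi64/_mm_unpackhi_epi64
// and make the max/min reductions actually use _mm_max_ps/_mm_min_ps instead
// of summing. A scalar oracle with the same semantics, handy as a unit-test
// reference, could look like this; the ref_* names are the editor's own.
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Scalar counterpart of GiReduceAddInt8: accumulate in int32, return int16.
static inline int16_t ref_reduce_add_s8(const int8_t* v, size_t n) {
    int32_t sum = 0;
    for (size_t i = 0; i < n; ++i)
        sum += v[i];
    return static_cast<int16_t>(sum);
}

// Scalar counterpart of GiReduceMaxInt8 (swap std::max for std::min to get
// the GiReduceMinInt8 oracle).
static inline int8_t ref_reduce_max_s8(const int8_t* v, size_t n) {
    int8_t m = v[0];
    for (size_t i = 1; i < n; ++i)
        m = std::max(m, v[i]);
    return m;
}
// ---------------------------------------------------------------------------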
_mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2))); - float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3))); + __m128 min0 = _mm_min_ps(v0, v1); + __m128 min1 = _mm_min_ps(v2, v3); + __m128 min = _mm_min_ps(min0, min1); + float ret0 = _mm_cvtss_f32(min); + float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(1, 1, 1, 1))); + float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 2, 2, 2))); + float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 3, 3, 3))); return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3))); #else int8_t min = Vector[0]; - for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { min = Min(min, Vector[i]); } return min; @@ -561,8 +564,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { #if __ARM_ARCH >= 8 int32x4_t vres0 = vcvtaq_s32_f32(src); int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); - int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16))); - return vcombine_s16(ret, ret); + return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); #else float32x4_t vzero = vdupq_n_f32(0.f); float32x4_t vfhalf = vdupq_n_f32(0.5f); @@ -570,8 +572,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { float32x4_t vinc0 = vbslq_f32(vcgeq_f32(src, vzero), vfhalf, vfneg_half); int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0)); int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); - int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16))); - return vcombine_s16(ret, ret); + return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); #endif #elif defined(GI_SSE42_INTRINSICS) __m128 vfzero = _mm_set1_ps(0.f); diff --git a/dnn/src/fallback/quantized_converter.h b/dnn/src/fallback/quantized_converter.h new file mode 100644 index 00000000..b842a862 --- /dev/null +++ b/dnn/src/fallback/quantized_converter.h @@ -0,0 +1,81 @@ +/** + * \file dnn/src/arm_common/quantized_converter.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once + +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" +#include "src/common/utils.h" +#include "src/fallback/general_intrinsic/gi_float.h" +#include "src/fallback/general_intrinsic/gi_int.h" + +namespace megdnn { +namespace fallback { + +struct QConverterBase { + inline static GI_INT32 vzero() { return GiBroadcastInt32(0); } + + inline static GI_FLOAT32 vfzero() { return GiBroadcastFloat32(0.f); } + + inline static GI_FLOAT32 vfhalf() { return GiBroadcastFloat32(0.5f); } + + inline static GI_FLOAT32 vfneg_half() { return GiBroadcastFloat32(-0.5f); } +}; + +struct QConverter { + template + static inline dst_type convert(const src_type&... src); + + template + static inline dst_type round(const src_type&... 
src); +}; + +template <> +inline dt_qint8 QConverter::convert(const float& src) { + return dt_qint8(saturate(std::round(src), -128, 127)); +} + +template <> +inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) { + return dt_quint8(saturate(std::round(src) + zp, 0, 255)); +} + +template <> +inline dt_qint32 QConverter::convert(const float& src) { + return dt_qint32(saturate( + std::round(src), static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max()))); +} + +template <> +inline GI_FLOAT32_V2 QConverter::convert(const GI_INT16& vsrc) { + GI_INT32 vhi = GiMoveHighLongInt16(vsrc); + GI_INT32 vlo = GiMoveLowLongInt16(vsrc); + return {{GiCastToFloat32(vlo), GiCastToFloat32(vhi)}}; +} + +template <> +inline GI_INT8 QConverter::convert(const GI_FLOAT32_V2& vsrc) { + return GiCvtFromFloat32V2ToInt8(vsrc); +} +template <> +inline GI_INT8 QConverter::convert(const GI_FLOAT32& src) { + return GiCvtFromFloat32ToInt8(src); +} + +template <> +inline GI_INT32 QConverter::round(const GI_FLOAT32& vsrc) { + return GiRoundAsInt32(vsrc); +} +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/reduce/opr_impl.cpp b/dnn/src/fallback/reduce/opr_impl.cpp index 78192424..66041510 100644 --- a/dnn/src/fallback/reduce/opr_impl.cpp +++ b/dnn/src/fallback/reduce/opr_impl.cpp @@ -14,11 +14,13 @@ #include "src/naive/handle.h" #include "midout.h" +#include "reducer.h" #include "src/common/reduce_helper.h" MIDOUT_DECL(megdnn_fb_reduce_op) MIDOUT_DECL(megdnn_fb_reduce_c) MIDOUT_DECL(megdnn_fb_reduce_dtype) +MIDOUT_DECL(megdnn_fallback_reduce_optimized) namespace { @@ -77,11 +79,20 @@ namespace fallback { void ReduceImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { + check_exec(src.layout, dst.layout, workspace.size); + if (!exec_optimized(src, dst, workspace)) { + return exec_fallback(src, dst, workspace); + } +} + +void ReduceImpl::exec_fallback( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { using namespace reduce; using Mode = Param::Mode; check_exec(src.layout, dst.layout, workspace.size); size_t A, B, C; get_ABC(src.layout, A, B, C, param().axis); + #define cb_by_op(src_type, dst_type, _wtype, mode_, Op_, kern_func) \ if (param().mode == mode_) { \ typedef DTypeTrait::ctype src_ctype; \ @@ -176,6 +187,101 @@ void ReduceImpl::exec( naive::ReduceForwardImpl::exec(src, dst, workspace); } +bool ReduceImpl::exec_optimized( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) { + size_t A, B, C; + reduce::get_ABC(src.layout, A, B, C, param().axis); + bool execed = false; + using Mode = param::Reduce::Mode; +#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \ + if (C == 1) { \ + using _Reducer = Reducer; \ + std::function \ + do_reduce = Exec<_Reducer, true>::do_reduce; \ + MIDOUT_BEGIN( \ + megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ + midout_iv(0)) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ + execed = true; \ + } \ + MIDOUT_END(); \ + } else { \ + using _Reducer = Reducer; \ + std::function \ + do_reduce = Exec<_Reducer, false>::do_reduce; \ + MIDOUT_BEGIN( \ + megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ + midout_iv(1)) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ + execed = true; \ + } \ + 
MIDOUT_END(); \ + } + +#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \ + switch (param().mode) { \ + case Mode::MEAN: \ + DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \ + break; \ + case Mode::MAX: \ + DISPATCH_FUNC(maxReducer, dtype, ctype, ctype); \ + break; \ + case Mode::MIN: \ + DISPATCH_FUNC(minReducer, dtype, ctype, ctype); \ + break; \ + default: \ + break; \ + } + +#define DISPATCH_MODE_FLOAT(dtype, ctype, comp_type) \ + switch (param().mode) { \ + case Mode::MEAN: \ + DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \ + break; \ + case Mode::MAX: \ + DISPATCH_FUNC(maxReducer, dtype, ctype, ctype); \ + break; \ + case Mode::MIN: \ + DISPATCH_FUNC(minReducer, dtype, ctype, ctype); \ + break; \ + case Mode::SUM: \ + DISPATCH_FUNC(SumReducer, dtype, ctype, ctype); \ + break; \ + case Mode::SUM_SQR: \ + DISPATCH_FUNC(SumSqrReducer, dtype, ctype, ctype); \ + break; \ + case Mode::PRODUCT: \ + DISPATCH_FUNC(ProductReducer, dtype, ctype, ctype); \ + break; \ + default: \ + break; \ + } + if (src.layout.is_contiguous() && + src.layout.dtype.category() == DTypeCategory::QUANTIZED && + param().data_type == param::Reduce::DataType::DEFAULT) { + DType src_type = src.layout.dtype; + if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { + DISPATCH_MODE_QUANTIZED(dt_qint8, int8_t, int32_t) + } + } else if ( + src.layout.is_contiguous() && + src.layout.dtype.category() == DTypeCategory::FLOAT && + param().data_type == param::Reduce::DataType::DEFAULT) { + DType src_type = src.layout.dtype; + if (src.layout.dtype.enumv() == DTypeEnum::Float32) { + DISPATCH_MODE_FLOAT(dt_float32, float, float) + } + } + return execed; +#undef DISPATCH_FUNC +#undef DISPATCH_MODE_QUANTIZED +#undef DISPATCH_MODE_FLOAT +} + } // namespace fallback } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/reduce/opr_impl.h b/dnn/src/fallback/reduce/opr_impl.h index 759b405d..3db4401a 100644 --- a/dnn/src/fallback/reduce/opr_impl.h +++ b/dnn/src/fallback/reduce/opr_impl.h @@ -19,6 +19,10 @@ public: using ReduceForwardImpl::ReduceForwardImpl; void exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) override; + bool exec_optimized( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); + void exec_fallback( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); }; } // namespace fallback diff --git a/dnn/src/fallback/reduce/reducer.h b/dnn/src/fallback/reduce/reducer.h new file mode 100644 index 00000000..efeee039 --- /dev/null +++ b/dnn/src/fallback/reduce/reducer.h @@ -0,0 +1,417 @@ +/** + * \file dnn/src/fallback/reduce/reducer.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "src/common/utils.h" +#include "src/fallback/general_intrinsic/gi_float.h" +#include "src/fallback/general_intrinsic/gi_int.h" +#include "src/fallback/quantized_converter.h" + +using namespace megdnn; +using namespace fallback; + +namespace { + +/*****************************Mean Reducer***********************/ +template +struct MeanReducer; + +template <> +struct MeanReducer { + using ctype = int8_t; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); + + int32_t res; + float coef; + MeanReducer(DType, size_t cnt) : res(0), coef(1.0 / cnt) {} + MeanReducer() = default; + void feed(const int8_t* val) { res += GiReduceAddInt8(GiLoadInt8(val)); } + void feed_remain(const int8_t* val) { res += *val; } + void post(int8_t* dst) { + float sum = res * coef; + *dst = std::round(sum); + } +}; + +template <> +struct MeanReducer { + using ctype = int8_t; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); + + GI_INT32 res[4]; + int32_t remain; + int32_t cnt; + float coef; + GI_FLOAT32 vcoef; + MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { + memset(res, 0, sizeof(res)); + vcoef = GiBroadcastFloat32(coef); + } + MeanReducer() = default; + void feed(const int8_t* val) { + const GI_INT8 vval = GiLoadInt8(val); + const GI_INT16 vval_low = GiMoveLowLongInt8(vval); + const GI_INT16 vval_high = GiMoveHighLongInt8(vval); + + const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low); + const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low); + const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high); + const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high); + + res[0] = GiAddInt32(res[0], vval_low_low); + res[1] = GiAddInt32(res[1], vval_low_high); + res[2] = GiAddInt32(res[2], vval_high_low); + res[3] = GiAddInt32(res[3], vval_high_high); + } + void feed_remain(const int8_t* val) { remain += *val; } + void post(int8_t* dst) { + for (int i = 0; i < 4; i += 2) { + GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); + GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); + GiStoreLowInt8( + dst, + (QConverter::convert({{vitem0, vitem1}}))); + dst += 8; + } + } + void post_remain(int8_t* dst) { + float sum = remain * coef; + *dst = std::round(sum); + } +}; + +template <> +struct MeanReducer { + using ctype = float; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); + + GI_FLOAT32 res; + float result; + float coef; + MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { + res = GiBroadcastFloat32(0.0f); + } + MeanReducer() = default; + void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); } + void feed_remain(const float* val) { result += *val; } + void post(float* dst) { + result += GiReduceAddFloat32(res); + *dst = result * coef; + } +}; + +template <> +struct MeanReducer { + using ctype = float; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); + + GI_FLOAT32 res; + float remain; + float coef; + MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { + res = GiBroadcastFloat32(0.0f); + } + MeanReducer() = default; + void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); } + void feed_remain(const float* val) { remain += *val; } + void post(float* dst) { + res = GiMultiplyScalerFloat32(res, coef); + GiStoreFloat32(dst, res); + } + void post_remain(float* dst) { *dst = remain * coef; } +}; + +/******************************max min 
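// ---------------------------------------------------------------------------
// Editor's note (illustrative, not part of the patch): every reducer in this
// header follows the same implicit interface, which the Exec<> drivers at the
// bottom of the file rely on. "SkeletonReducer" below is a made-up name that
// only documents the expected shape; the real reducers are defined around it.
struct SkeletonReducer {
    using ctype = float;                                  // element type
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
    SkeletonReducer(DType src_dtype, size_t reduce_len);  // reduce_len == B
    void feed(const ctype* val);         // consume SIMD_WIDTH contiguous values
    void feed_remain(const ctype* val);  // consume one scalar tail value
    void post(ctype* dst);               // write the accumulated result(s)
    void post_remain(ctype* dst);        // scalar tail output (C != 1 variant)
};
// The C == 1 specializations collapse everything to one scalar in post(); the
// C != 1 specializations keep a full vector in post() plus post_remain() for
// the columns that do not fill a whole vector.
// ---------------------------------------------------------------------------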
Reducer****************************/ +template +struct maxReducer; +template +struct minReducer; + +#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32 res; \ + _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + auto vval = GiLoadFloat32(val); \ + res = Gi##_Mode##imumFloat32(vval, res); \ + } \ + void feed_remain(const float* val) { \ + auto vval = GiBroadcastFloat32(*val); \ + res = Gi##_Mode##imumFloat32(vval, res); \ + } \ + void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \ + } + +REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits::lowest()); +REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits::max()); +#undef REDUCER_MAX_MIN_C1 + +#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32 res; \ + float remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiBroadcastFloat32(_init); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + GI_FLOAT32 vval = GiLoadFloat32(val); \ + res = Gi##_Mode##imumFloat32(vval, res); \ + } \ + void feed_remain(const float* val) { \ + using namespace std; \ + remain = _mode(*val, remain); \ + } \ + void post(float* dst) { GiStoreFloat32(dst, res); } \ + void post_remain(float* dst) { *dst = remain; } \ + } + +REDUCER_MAX_MIN_C(max, Max, std::numeric_limits::lowest()); +REDUCER_MAX_MIN_C(min, Min, std::numeric_limits::max()); +#undef REDUCER_MAX_MIN_C + +#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = int8_t; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ + GI_INT8 res; \ + _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ + _mode##Reducer() = default; \ + void feed(const int8_t* val) { \ + GI_INT8 vval = GiLoadInt8(val); \ + res = Gi##_Mode##imumInt8(vval, res); \ + } \ + void feed_remain(const int8_t* val) { \ + GI_INT8 vval = GiBroadcastInt8(*val); \ + res = Gi##_Mode##imumInt8(vval, res); \ + } \ + void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ + } + +REDUCER_MAX_MIN_C1(max, Max, -128); +REDUCER_MAX_MIN_C1(min, Min, 127); +#undef REDUCER_MAX_MIN_C1 + +#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = int8_t; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ + GI_INT8 res; \ + int8_t remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiBroadcastInt8(_init); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const int8_t* val) { \ + GI_INT8 vval = GiLoadInt8(val); \ + res = Gi##_Mode##imumInt8(vval, res); \ + } \ + void feed_remain(const int8_t* val) { \ + using namespace std; \ + remain = _mode(*val, remain); \ + } \ + void post(int8_t* dst) { GiStoreInt8(dst, res); } \ + void post_remain(int8_t* dst) { *dst = remain; } \ + } + +REDUCER_MAX_MIN_C(max, Max, -128); +REDUCER_MAX_MIN_C(min, Min, 127); +#undef REDUCER_MAX_MIN_C + +/***************************Sum Product Reducer***************************/ +template +struct SumReducer; +template +struct ProductReducer; + +#define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ 
+ static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32 res; \ + float remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiBroadcastFloat32(_init); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + GI_FLOAT32 vval = GiLoadFloat32(val); \ + res = Gi##_Mode##Float32(vval, res); \ + } \ + void feed_remain(const float* val) { \ + using namespace std; \ + auto op = _op(); \ + remain = op(remain, *val); \ + } \ + void post(float* dst) { \ + using namespace std; \ + auto op = _op(); \ + *dst = op(remain, GiReduce##_Mode##Float32(res)); \ + } \ + } + +REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f); +REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); +#undef REDUCER_SUM_PRODUCT_C1 + +#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32 res; \ + float remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiBroadcastFloat32(_init); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + GI_FLOAT32 vval = GiLoadFloat32(val); \ + res = Gi##_Mode##Float32(vval, res); \ + } \ + void feed_remain(const float* val) { \ + using namespace std; \ + auto op = _op(); \ + remain = op(remain, (*val)); \ + } \ + void post(float* dst) { GiStoreFloat32(dst, res); } \ + void post_remain(float* dst) { *dst = remain; } \ + } + +REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f); +REDUCER_SUM_PRODUCT_C(Product, Multiply, multiplies, 1.0f); +#undef REDUCER_SUM_PRODUCT_C + +/***************************SumSqr Reducer***************************/ +template +struct SumSqrReducer; + +template <> +struct SumSqrReducer { + using ctype = float; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); + + GI_FLOAT32 res; + float result; + SumSqrReducer(DType, size_t cnt) : result(0.0f) { + MEGDNN_MARK_USED_VAR(cnt); + res = GiBroadcastFloat32(0.0f); + } + SumSqrReducer() = default; + void feed(const float* val) { + GI_FLOAT32 vval = GiLoadFloat32(val); + res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); + } + void feed_remain(const float* val) { + float vval = *val; + result += vval * vval; + } + void post(float* dst) { + result += GiReduceAddFloat32(res); + *dst = result; + } +}; +template <> +struct SumSqrReducer { + using ctype = float; + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); + + GI_FLOAT32 res; + float remain; + SumSqrReducer(DType, size_t cnt) : remain(0.0f) { + MEGDNN_MARK_USED_VAR(cnt); + res = GiBroadcastFloat32(0.0f); + } + SumSqrReducer() = default; + void feed(const float* val) { + GI_FLOAT32 vval = GiLoadFloat32(val); + res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); + } + void feed_remain(const float* val) { remain += (*val) * (*val); } + void post(float* dst) { GiStoreFloat32(dst, res); } + void post_remain(float* dst) { *dst = remain; } +}; +/**************************************do reduce*************************/ + +template +struct Exec { + static void do_reduce( + const typename Reducer::ctype* src, typename Reducer::ctype* dst, + DType src_dtype, size_t A, size_t B, size_t C); +}; + +template +struct Exec { + static void do_reduce( + const typename Reducer::ctype* src, typename Reducer::ctype* dst, + DType src_dtype, size_t A, size_t B, size_t) { + size_t a = 0; + for (; a < A; a++) { + Reducer reducer0(src_dtype, B); + auto temp_src0 = src + a * B; + size_t b = 0; + for (; b + 
Reducer::SIMD_WIDTH <= B; b += Reducer::SIMD_WIDTH) { + reducer0.feed(temp_src0); + temp_src0 += Reducer::SIMD_WIDTH; + } + for (; b < B; b++) { + reducer0.feed_remain(temp_src0); + temp_src0++; + } + reducer0.post(dst); + dst++; + } + } +}; + +template +struct Exec { + static void do_reduce( + const typename Reducer::ctype* src, typename Reducer::ctype* dst, + DType src_dtype, size_t A, size_t B, size_t C) { + for (size_t a = 0; a < A; a++) { + size_t c = 0; + for (; c + Reducer::SIMD_WIDTH <= C; c += Reducer::SIMD_WIDTH) { + Reducer reducer(src_dtype, B); + for (size_t b = 0; b < B; b++) + reducer.feed(src + c + C * b); + reducer.post(dst); + dst += Reducer::SIMD_WIDTH; + } + for (; c < C; c++) { + Reducer reducer(src_dtype, B); + for (size_t b = 0; b < B; b++) + reducer.feed_remain(src + c + C * b); + reducer.post_remain(dst); + dst++; + } + src += B * C; + } + } +}; + +} // namespace + +// vim: syntax=cpp.doxygen diff --git a/dnn/test/arm_common/lstm.cpp b/dnn/test/arm_common/lstm.cpp index ddc45a10..ebcc6f89 100644 --- a/dnn/test/arm_common/lstm.cpp +++ b/dnn/test/arm_common/lstm.cpp @@ -181,7 +181,6 @@ TEST_F(ARM_COMMON, LSTM_FORWARD_RECORD) { TEST_F(ARM_COMMON, BENCHMARK_LSTM_FORWARD) { Benchmarker optimized_bench(handle()); - constexpr size_t RUNS = 20; auto run = [&](size_t hidden_size, size_t input_size) { optimized_bench.set_times(20).set_display(true); size_t gate_hidden_size = 4 * hidden_size; diff --git a/dnn/test/fallback/reduce.cpp b/dnn/test/fallback/reduce.cpp index 0a8de3b6..c508649b 100644 --- a/dnn/test/fallback/reduce.cpp +++ b/dnn/test/fallback/reduce.cpp @@ -18,6 +18,75 @@ using namespace megdnn; using namespace test; +TEST_F(FALLBACK, REDUCE_FULL) { + using Param = Reduce::Param; + using Mode = Param::Mode; + Checker checker(handle()); + UniformIntRNG rng{INT8_MIN >> 1, INT8_MAX >> 1}; + checker.set_rng(0, &rng); + struct Config { + Param param; + DType dtype; + TensorShape shape; + Config(Param param, DType dtype, TensorShape shape) + : param(param), dtype(dtype), shape(shape) {} + }; + std::vector configs; + for (auto mode : {Mode::MEAN, Mode::MAX, Mode::MIN}) + for (auto dtype : std::vector{ + dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f), + dtype::Quantized8Asymm(1.3f, static_cast(3))}) + for (int32_t axis : {0, 1, 2}) { + for (size_t A : {1, 3, 5}) { + for (size_t B : {4, 6, 9, 16, 33, 45}) { + for (size_t C : {4, 6, 9, 16, 33, 45}) { + TensorShape shape{A, B, C}; + Param param(mode, axis); + Config config(param, dtype, shape); + configs.push_back(config); + } + } + } + } + for (auto&& config : configs) { + auto&& dtype = config.dtype; + auto&& param = config.param; + auto&& shape = config.shape; + + checker.set_dtype(0, dtype).set_param(param).execs({shape, {}}); + } + configs.clear(); + for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR}) + for (auto dtype : std::vector{dtype::Float32(), dtype::Float16()}) + for (int32_t axis : {0, 1, 2}) { + for (size_t A : {1, 3, 5}) { + for (size_t B : {4, 6, 9, 16, 33, 45}) { + for (size_t C : {4, 6, 9, 16, 33, 45}) { + TensorShape shape{A, B, C}; + Param param(mode, axis); + Config config(param, dtype, shape); + configs.push_back(config); + } + } + } + } + + UniformFloatRNG rng_float(-2, 2); + checker.set_rng(0, &rng_float); + checker.set_epsilon(1e-1); + for (auto&& config : configs) { + auto&& dtype = config.dtype; + auto&& param = config.param; + auto&& shape = config.shape; + if (dtype == dtype::Float16()) + checker.set_epsilon(1e-1); + else + checker.set_epsilon(1e-3); + + checker.set_dtype(0, 
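// ---------------------------------------------------------------------------
// Editor's note (illustrative, not part of the patch): a scalar reference for
// the C != 1 traversal that Exec<Reducer, false>::do_reduce performs earlier
// in this patch (dnn/src/fallback/reduce/reducer.h), written out for a plain
// sum. Reducing a contiguous (A, B, C) tensor over the middle axis reads
// src[a*B*C + b*C + c] for b in [0, B) and writes one value per (a, c) pair;
// the function name below is the editor's own.
#include <cstddef>

static void ref_reduce_middle_axis(
        const float* src, float* dst, size_t A, size_t B, size_t C) {
    for (size_t a = 0; a < A; ++a)
        for (size_t c = 0; c < C; ++c) {
            float acc = 0.f;
            for (size_t b = 0; b < B; ++b)
                acc += src[a * B * C + b * C + c];
            dst[a * C + c] = acc;
        }
}
// ---------------------------------------------------------------------------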
dtype).set_param(param).execs({shape, {}});
+    }
+}
+
 TEST_F(FALLBACK, REDUCE) {
     using Param = Reduce::Param;
     using Mode = Param::Mode;
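// ---------------------------------------------------------------------------
// Editor's note (illustrative, not part of the patch): a minimal hand-written
// case in the style of REDUCE_FULL above, for exercising the new fallback
// reduce path in isolation. The shape, mode and function name are arbitrary
// editor choices; it assumes the includes and using-directives already present
// in this test file.
static void run_single_fallback_reduce_case(Handle* handle) {
    using Param = Reduce::Param;
    using Mode = Param::Mode;
    Checker<Reduce> single_case(handle);
    single_case.set_dtype(0, dtype::Float32())
            .set_param(Param(Mode::MEAN, /*axis=*/1))  // hits the C != 1 optimized path
            .execs({{3, 64, 5}, {}});
}
// ---------------------------------------------------------------------------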