@@ -1,5 +1,6 @@
#pragma once

+#include "src/common/unroll_macro.h"
#include "src/common/utils.h"
#include "src/fallback/general_intrinsic/gi_float.h"
#include "src/fallback/general_intrinsic/gi_int.h"
|
|
@@ -395,14 +396,14 @@ template <typename Reducer, bool C1>
struct Exec {
    static void do_reduce(
            const typename Reducer::ctype* src, typename Reducer::ctype* dst,
-           DType src_dtype, size_t A, size_t B, size_t C);
+           DType src_dtype, size_t A, size_t B, size_t C, _megdnn_workspace);
};

template <typename Reducer>
struct Exec<Reducer, true> {
    static void do_reduce(
            const typename Reducer::ctype* src, typename Reducer::ctype* dst,
-           DType src_dtype, size_t A, size_t B, size_t) {
+           DType src_dtype, size_t A, size_t B, size_t, _megdnn_workspace) {
        size_t a = 0;
        for (; a < A; a++) {
            Reducer reducer0(src_dtype, B);
|
|
@@ -426,7 +427,7 @@ template <typename Reducer>
struct Exec<Reducer, false> {
    static void do_reduce(
            const typename Reducer::ctype* src, typename Reducer::ctype* dst,
-           DType src_dtype, size_t A, size_t B, size_t C) {
+           DType src_dtype, size_t A, size_t B, size_t C, _megdnn_workspace) {
        for (size_t a = 0; a < A; a++) {
            size_t c = 0;
            for (; c + Reducer::SIMD_WIDTH <= C; c += Reducer::SIMD_WIDTH) {
|
|
@@ -448,10 +449,276 @@ struct Exec<Reducer, false> {
    }
};
|
|
|
|
|
|
|
// kern_4x15xT()
// 1. Processes the input in blocks of SIMD_WIDTH x 15 x T elements per
//    iteration of the outer loop.
// 2. T trades speed for accuracy: each block of SIMD_WIDTH x 15 x T elements
//    is accumulated into a single vector of SIMD_WIDTH partial sums.
// 3. Registers d0-d14 are used for loading; they are reduced by pairwise
//    (tree) addition, the result is accumulated into d15, and d15 is stored
//    to the output once every T iterations.
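// A minimal scalar model of the tree reduction described above (exposition
// only; the function name is ours and nothing below is part of the original
// patch). It sums 15 values in the same pairwise order the kernel applies to
// registers d0-d14, which bounds rounding error better than one running sum.
static inline float pairwise_sum_15_model(const float* v) {
    float t[8];
    for (int i = 0; i < 7; ++i)  // level 1: (d0,d1), (d2,d3), ..., (d12,d13)
        t[i] = v[2 * i] + v[2 * i + 1];
    t[7] = v[14];                // d14 joins at the next level
    t[0] += t[1];                // level 2
    t[2] += t[3];
    t[4] += t[5];
    t[6] += t[7];
    t[0] += t[2];                // level 3
    t[4] += t[6];
    return t[0] + t[4];          // level 4: one value, accumulated into d15
}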
|
|
|
|
|
|
|
// kern_4xXXx1()
// Entered when fewer than SIMD_WIDTH x 15 elements remain.
// 1. The first switch gathers the scalar tail (fewer than SIMD_WIDTH
//    elements) into one zero-padded vector, so that all subsequent work can
//    stay in vector registers.
// 2. The second switch loads the remaining whole vectors.
// 3. The third switch reduces the loaded vectors by pairwise addition down
//    to a single vector.
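// A sketch of step 1 under the assumption SIMD_WIDTH == 4 (the helper name
// is ours, not part of the original patch): the last tail_num (< 4) scalars
// are packed, in reverse order, into a zero-padded 4-float buffer that a
// single vector load can then consume. Order is irrelevant for a sum.
static inline void gather_tail_model(
        const float* src, size_t size, size_t tail_num, float buf[4]) {
    buf[0] = buf[1] = buf[2] = buf[3] = 0.f;
    for (size_t i = 0; i < tail_num; ++i)
        buf[i] = src[size - 1 - i];  // mirrors the reversed indexing below
}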
|
|
|
#define ImplementC1LargeB(rd_type, coef, case_load, load, for_shift, cal_final_res) \
    template <> \
    struct Exec<rd_type##Reducer<dt_float32, float, float, true>, true> { \
        using rd_type##Reducer_ = rd_type##Reducer<dt_float32, float, float, true>; \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
        static constexpr int VREG_NUM = 16; \
        static void kern_4x15xT( \
                const float* read_ptr, size_t& read_idx, float* write_ptr, \
                size_t& write_idx, size_t remain_size, size_t T) { \
            GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, \
                    d12, d13, d14, d15; \
            constexpr size_t STEP = SIMD_WIDTH * (VREG_NUM - 1); \
            while (read_idx + STEP <= remain_size) { \
                d15 = GiBroadcastFloat32(0.0); \
                size_t i = 0; \
                for (; read_idx + STEP <= remain_size && i < T; \
                     read_idx += STEP, i++) { \
                    const float* _read_ptr = read_ptr + read_idx; \
                    /* fill d0-d14, then sum them with a pairwise tree */ \
                    UNROLL_CALL_RAW(15, load, _read_ptr, read_ptr, write_ptr) \
                    d0 = GiAddFloat32(d0, d1); \
                    d2 = GiAddFloat32(d2, d3); \
                    d4 = GiAddFloat32(d4, d5); \
                    d6 = GiAddFloat32(d6, d7); \
                    d8 = GiAddFloat32(d8, d9); \
                    d10 = GiAddFloat32(d10, d11); \
                    d12 = GiAddFloat32(d12, d13); \
                    d0 = GiAddFloat32(d0, d2); \
                    d4 = GiAddFloat32(d4, d6); \
                    d8 = GiAddFloat32(d8, d10); \
                    d12 = GiAddFloat32(d12, d14); \
                    d0 = GiAddFloat32(d0, d4); \
                    d8 = GiAddFloat32(d8, d12); \
                    d0 = GiAddFloat32(d0, d8); \
                    d15 = GiAddFloat32(d0, d15); \
                } \
                /* one store per T blocks keeps the accumulator in registers */ \
                GiStoreFloat32(write_ptr + write_idx, d15); \
                write_idx += SIMD_WIDTH; \
            } \
        } \
|
|
|
        static void kern_4xXXx1( \
                const float* read_ptr, size_t& read_idx, float* write_ptr, \
                size_t& write_idx, size_t remain_size) { \
            size_t block_num = remain_size / SIMD_WIDTH; \
            size_t tail_num = remain_size % SIMD_WIDTH; \
            if (block_num == 0) { \
                /* no whole vector left: handle the tail with scalar code */ \
                for_shift(read_ptr, read_idx, write_ptr, write_idx, tail_num); \
                write_idx += tail_num; \
            } else { \
                GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, \
                        d11, d12, d13, d14, d15; \
                float buf[4]; \
                /* gather the scalar tail into one zero-padded vector */ \
                /* (cases cover tail_num < 4, i.e. assume SIMD_WIDTH == 4) */ \
                switch (tail_num) { \
                    case 3: \
                        buf[0] = read_ptr[read_idx + remain_size - 1]; \
                        buf[1] = read_ptr[read_idx + remain_size - 2]; \
                        buf[2] = read_ptr[read_idx + remain_size - 3]; \
                        buf[3] = 0; \
                        load(0, buf, read_ptr, write_ptr); \
                        break; \
                    case 2: \
                        buf[0] = read_ptr[read_idx + remain_size - 1]; \
                        buf[1] = read_ptr[read_idx + remain_size - 2]; \
                        buf[2] = 0; \
                        buf[3] = 0; \
                        load(0, buf, read_ptr, write_ptr); \
                        break; \
                    case 1: \
                        buf[0] = read_ptr[read_idx + remain_size - 1]; \
                        buf[1] = 0; \
                        buf[2] = 0; \
                        buf[3] = 0; \
                        load(0, buf, read_ptr, write_ptr); \
                        break; \
                    default: \
                        d0 = GiBroadcastFloat32(0.0); \
                        break; \
                } \
|
|
|
                d15 = d0; /* keep the tail sum: the loads below overwrite d0 */ \
                remain_size -= tail_num; \
                const float* _read_ptr = read_ptr + read_idx; \
                /* load the remaining whole vectors; every case falls through */ \
                switch (block_num) { \
                    case_load(15, _read_ptr, 14, read_ptr, write_ptr); \
                    case_load(14, _read_ptr, 13, read_ptr, write_ptr); \
                    case_load(13, _read_ptr, 12, read_ptr, write_ptr); \
                    case_load(12, _read_ptr, 11, read_ptr, write_ptr); \
                    case_load(11, _read_ptr, 10, read_ptr, write_ptr); \
                    case_load(10, _read_ptr, 9, read_ptr, write_ptr); \
                    case_load(9, _read_ptr, 8, read_ptr, write_ptr); \
                    case_load(8, _read_ptr, 7, read_ptr, write_ptr); \
                    case_load(7, _read_ptr, 6, read_ptr, write_ptr); \
                    case_load(6, _read_ptr, 5, read_ptr, write_ptr); \
                    case_load(5, _read_ptr, 4, read_ptr, write_ptr); \
                    case_load(4, _read_ptr, 3, read_ptr, write_ptr); \
                    case_load(3, _read_ptr, 2, read_ptr, write_ptr); \
                    case_load(2, _read_ptr, 1, read_ptr, write_ptr); \
                    case_load(1, _read_ptr, 0, read_ptr, write_ptr); \
                    default: \
                        break; \
                } \
|
|
|
                d0 = GiAddFloat32(d0, d15); \
                /* pairwise-halve d0..d(block_num-1) until one vector is left; \
                   an odd trailing register is carried down unpaired */ \
                while (block_num > 1) { \
                    switch (block_num) { \
                        case 15: \
                        case 14: \
                            d0 = GiAddFloat32(d0, d1); \
                            d1 = GiAddFloat32(d2, d3); \
                            d2 = GiAddFloat32(d4, d5); \
                            d3 = GiAddFloat32(d6, d7); \
                            d4 = GiAddFloat32(d8, d9); \
                            d5 = GiAddFloat32(d10, d11); \
                            d6 = GiAddFloat32(d12, d13); \
                            if (block_num & 1) \
                                d7 = d14; \
                            break; \
                        case 13: \
                        case 12: \
                            d0 = GiAddFloat32(d0, d1); \
                            d1 = GiAddFloat32(d2, d3); \
                            d2 = GiAddFloat32(d4, d5); \
                            d3 = GiAddFloat32(d6, d7); \
                            d4 = GiAddFloat32(d8, d9); \
                            d5 = GiAddFloat32(d10, d11); \
                            if (block_num & 1) \
                                d6 = d12; \
                            break; \
                        case 11: \
                        case 10: \
                            d0 = GiAddFloat32(d0, d1); \
                            d1 = GiAddFloat32(d2, d3); \
                            d2 = GiAddFloat32(d4, d5); \
                            d3 = GiAddFloat32(d6, d7); \
                            d4 = GiAddFloat32(d8, d9); \
                            if (block_num & 1) \
                                d5 = d10; \
                            break; \
                        case 9: \
                        case 8: \
                            d0 = GiAddFloat32(d0, d1); \
                            d1 = GiAddFloat32(d2, d3); \
                            d2 = GiAddFloat32(d4, d5); \
                            d3 = GiAddFloat32(d6, d7); \
                            if (block_num & 1) \
                                d4 = d8; \
                            break; \
                        case 7: \
                        case 6: \
                            d0 = GiAddFloat32(d0, d1); \
                            d1 = GiAddFloat32(d2, d3); \
                            d2 = GiAddFloat32(d4, d5); \
                            if (block_num & 1) \
                                d3 = d6; \
                            break; \
                        case 5: \
                        case 4: \
                            d0 = GiAddFloat32(d0, d1); \
                            d1 = GiAddFloat32(d2, d3); \
                            if (block_num & 1) \
                                d2 = d4; \
                            break; \
                        case 3: \
                        case 2: \
                            d0 = GiAddFloat32(d0, d1); \
                            if (block_num & 1) \
                                d1 = d2; \
                            break; \
                        default: \
                            break; \
                    } \
                    block_num = (block_num + 1) / 2; \
                } \
                GiStoreFloat32(write_ptr + write_idx, d0); \
                write_idx += SIMD_WIDTH; \
            } \
        } \
|
|
|
        static void do_reduce( \
                const float* src, float* dst, DType src_dtype, size_t A, \
                size_t B, size_t, _megdnn_workspace workspace) { \
            MEGDNN_MARK_USED_VAR(src_dtype); \
            float* workspace_ptr = workspace.raw_ptr->as<float>(); \
            constexpr size_t T = 3; \
            for (size_t a = 0; a < A; a++) { \
                size_t remain_size = B; \
                const float* read_ptr = src + a * B; \
                float* write_ptr = workspace_ptr; \
                /* repeatedly shrink the row into the workspace until at */ \
                /* most one vector's worth of partial sums remains */ \
                while (remain_size > SIMD_WIDTH) { \
                    size_t read_idx = 0; \
                    size_t write_idx = 0; \
                    kern_4x15xT( \
                            read_ptr, read_idx, write_ptr, write_idx, \
                            remain_size, T); \
                    kern_4xXXx1( \
                            read_ptr, read_idx, write_ptr, write_idx, \
                            remain_size - read_idx); \
                    remain_size = write_idx; \
                    read_ptr = workspace_ptr; \
                } \
                cal_final_res(remain_size, read_ptr, write_ptr, dst, coef); \
                dst++; \
            } \
        } \
    };
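// Worked example of the pipeline above, assuming SIMD_WIDTH == 4 and T == 3
// (so STEP == 60; the numbers are ours, for exposition). For one row with
// B == 1000:
//   pass 1: kern_4x15xT consumes five full 180-element groups plus one
//           partial 60-element group (960 elements -> 24 partial sums), then
//           kern_4xXXx1 folds the remaining 40 elements into 4 more, so
//           1000 -> 28 workspace floats;
//   pass 2: kern_4x15xT is a no-op (28 < 60) and kern_4xXXx1 shrinks 28 -> 4;
//   the while loop then exits (remain_size <= SIMD_WIDTH) and cal_final_res
//   sums the last 4 floats scalarly and applies coef.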
|
|
|
|
|
|
|
#define GI_LOAD(SHIFT, PTR, RD_PTR, WR_PTR) \
    d##SHIFT = GiLoadFloat32((PTR) + SIMD_WIDTH * SHIFT);

/* square only while reading from src (first pass); once RD_PTR == WR_PTR the
 * workspace already holds squared values */
#define GI_LOAD_THEN_MULT(SHIFT, PTR, RD_PTR, WR_PTR) \
    d##SHIFT = GiLoadFloat32((PTR) + SIMD_WIDTH * SHIFT); \
    if (RD_PTR != WR_PTR) \
        d##SHIFT = GiMultiplyFloat32(d##SHIFT, d##SHIFT);
|
|
|
|
|
|
|
#define CASE_GI_LOAD(NUM, PTR, SHIFT, RD_PTR, WR_PTR) \
    case NUM: \
        GI_LOAD(SHIFT, PTR, RD_PTR, WR_PTR) \
        MEGDNN_FALLTHRU

#define CASE_GI_LOAD_THEN_MULT(NUM, PTR, SHIFT, RD_PTR, WR_PTR) \
    case NUM: \
        GI_LOAD_THEN_MULT(SHIFT, PTR, RD_PTR, WR_PTR) \
        MEGDNN_FALLTHRU
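// For reference, with block_num == 3 the load switch in kern_4xXXx1 expands
// roughly to the following (a hand-expanded sketch, not code in this file):
//
//     switch (block_num) {
//         case 3: d2 = GiLoadFloat32(_read_ptr + SIMD_WIDTH * 2); MEGDNN_FALLTHRU
//         case 2: d1 = GiLoadFloat32(_read_ptr + SIMD_WIDTH * 1); MEGDNN_FALLTHRU
//         case 1: d0 = GiLoadFloat32(_read_ptr + SIMD_WIDTH * 0); MEGDNN_FALLTHRU
//         default: break;
//     }
//
// i.e. a Duff's-device-style jump into a descending sequence of vector
// loads, so exactly the registers d0..d(block_num-1) are filled.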
|
|
|
|
|
|
|
#define FOR_MEAN_AND_SUM(rd_ptr, rd_idx, wr_ptr, wr_idx, tail_num) \
    for (size_t i = 0; i < tail_num; i++) \
        wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i];

/* on the first pass (reading src) the values must be squared; afterwards the
 * workspace already holds squares and is only copied */
#define FOR_SUM_SQUARE(rd_ptr, rd_idx, wr_ptr, wr_idx, tail_num) \
    if (rd_ptr != wr_ptr) \
        for (size_t i = 0; i < tail_num; i++) \
            wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i] * rd_ptr[rd_idx + i]; \
    else \
        for (size_t i = 0; i < tail_num; i++) \
            wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i];
|
|
|
|
|
|
|
#define CAL_FINAL_RESULT(remain_size, read_ptr, write_ptr, dst_ptr, coef) \
    float val = 0; \
    if (write_ptr != read_ptr) \
        /* no pass ran: the data still sits in src */ \
        for (size_t i = 0; i < remain_size; i++) \
            val = val + read_ptr[i]; \
    else \
        for (size_t i = 0; i < remain_size; i++) \
            val = val + write_ptr[i]; \
    *dst_ptr = val * coef;

#define CAL_FINAL_SQUARE_RESULT(remain_size, read_ptr, write_ptr, dst_ptr, coef) \
    float val = 0; \
    if (write_ptr != read_ptr) \
        /* no pass ran: square the raw src values while summing */ \
        for (size_t i = 0; i < remain_size; i++) \
            val = val + read_ptr[i] * read_ptr[i]; \
    else \
        for (size_t i = 0; i < remain_size; i++) \
            val = val + write_ptr[i]; \
    *dst_ptr = val * coef;
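// For reference, with the Mean instantiation below (coef == 1.0 / B) the
// expansion of CAL_FINAL_RESULT ends in `*dst = val * 1.0 / B;`; since coef
// is substituted textually, it should stay a floating-point expression to
// avoid accidental integer division.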
|
|
|
|
|
|
|
ImplementC1LargeB(
        Mean, 1.0 / B, CASE_GI_LOAD, GI_LOAD, FOR_MEAN_AND_SUM, CAL_FINAL_RESULT);
ImplementC1LargeB(Sum, 1, CASE_GI_LOAD, GI_LOAD, FOR_MEAN_AND_SUM, CAL_FINAL_RESULT);
ImplementC1LargeB(
        SumSqr, 1, CASE_GI_LOAD_THEN_MULT, GI_LOAD_THEN_MULT, FOR_SUM_SQUARE,
        CAL_FINAL_SQUARE_RESULT);
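// A minimal usage sketch (our illustration; in practice the fallback reduce
// operator drives this path): reduce each row of an A x B fp32 matrix to its
// mean. The workspace must be large enough for the first pass's partial sums
// (on the order of B / 15 floats; the operator's workspace query is
// authoritative).
static inline void example_reduce_mean_rows(
        const float* src, float* dst, size_t A, size_t B,
        _megdnn_workspace workspace) {
    using Reducer = MeanReducer<dt_float32, float, float, true>;
    Exec<Reducer, true>::do_reduce(
            src, dst, dtype::Float32(), A, B, /*C=*/1, workspace);
}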
|
|
|
|
|
|
|
template <typename Reducer, typename dtype, size_t B>
struct ExecC1SmallB {
    static void do_reduce(
-           const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C);
+           const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C,
+           _megdnn_workspace);
};
|
|
|
|
|
|
|
#define ImplementC1SmallB(_ctype, _gi_type, _gi_ins) \
@@ -459,7 +726,7 @@ struct ExecC1SmallB {
    struct ExecC1SmallB<Reducer, _ctype, B> { \
        static void do_reduce( \
                const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \
-               size_t) { \
+               size_t, _megdnn_workspace) { \
            size_t a = 0; \
            for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \
                Reducer reducer(src_dtype, B); \
|
|
|