Browse Source

feat(fallback): move arm_common pooling f32 algo to fallback gi

GitOrigin-RevId: 1bddd6dc2c
release-1.10
Megvii Engine Team 3 years ago
parent
commit
91aaafd587
19 changed files with 2763 additions and 115 deletions
  1. +5
    -12
      dnn/src/arm_common/pooling/algo.h
  2. +0
    -36
      dnn/src/arm_common/pooling/opr_impl.cpp
  3. +6
    -40
      dnn/src/arm_common/pooling/opr_impl.h
  4. +32
    -0
      dnn/src/fallback/general_intrinsic/gi_float.h
  5. +126
    -0
      dnn/src/fallback/gi_intrinsic_helper.h
  6. +403
    -0
      dnn/src/fallback/pooling/gi/algo.cpp
  7. +103
    -0
      dnn/src/fallback/pooling/gi/algo.h
  8. +9
    -9
      dnn/src/fallback/pooling/gi/algo_fp32_pooling_nchw44.cpp
  9. +157
    -0
      dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp
  10. +26
    -0
      dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h
  11. +89
    -0
      dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp
  12. +24
    -0
      dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h
  13. +306
    -0
      dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h
  14. +572
    -0
      dnn/src/fallback/pooling/gi/pooling_helper.h
  15. +174
    -12
      dnn/src/fallback/pooling/opr_impl.cpp
  16. +130
    -5
      dnn/src/fallback/pooling/opr_impl.h
  17. +3
    -1
      dnn/src/x86/pooling/algo.h
  18. +38
    -0
      dnn/test/fallback/gi.cpp
  19. +560
    -0
      dnn/test/fallback/pooling.cpp

+ 5
- 12
dnn/src/arm_common/pooling/algo.h View File

@@ -12,7 +12,7 @@
#pragma once
#include "src/arm_common/pooling/opr_impl.h"
#include "src/arm_common/pooling/pooling_helper.h"
#include "src/common//utils.h"
#include "src/common/utils.h"
#include "src/naive/handle.h"

namespace megdnn {
@@ -134,22 +134,15 @@ public:
void exec(const PoolingKernParam& param) const override;
MEGDNN_DECL_ALGO_TYPE(ARM_Filter5ModexStridexNCHW44)
};
class PoolingImpl::AlgoFp32ModexStridexNCHW44 final : public AlgoBase {
public:
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
const char* name() const override {
return "ARM_POOLING_FP32_MODEX_STRIDEX_NCHW44";
}
bool usable(const PoolingKernSizeParam& param) const override;
void exec(const PoolingKernParam& param) const override;
MEGDNN_DECL_ALGO_TYPE(ARM_Fp32ModexStridexNCHW44)
};

class PoolingImpl::AlgoFallback final : public AlgoBase {
public:
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
const char* name() const override { return "FALLBACK_POOLING"; }
bool usable(const PoolingKernSizeParam&) const override { return true; }
void exec(const PoolingKernParam&) const override {}
void exec(const PoolingKernParam&) const override {
megdnn_assert(false, "code issue happened!!");
}
MEGDNN_DECL_ALGO_TYPE(ARM_Fallback)
};
WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam& param);


+ 0
- 36
dnn/src/arm_common/pooling/opr_impl.cpp View File

@@ -32,7 +32,6 @@ private:
AlgoFilter3ModexStridexNCHW44 algo_filter3_modex_stridex_nchw4;
AlgoFilter4ModexStridexNCHW44 algo_filter4_modex_stridex_nchw4;
AlgoFilter5ModexStridexNCHW44 algo_filter5_modex_stridex_nchw4;
AlgoFp32ModexStridexNCHW44 algo_fp32_modex_stridex_nchw44;
AlgoFallback algo_fallback;

public:
@@ -49,7 +48,6 @@ public:
all_algos.emplace_back(&algo_filter2_modex_stridex_nchw4);
all_algos.emplace_back(&algo_filter4_modex_stridex_nchw4);
all_algos.emplace_back(&algo_filter5_modex_stridex_nchw4);
all_algos.emplace_back(&algo_fp32_modex_stridex_nchw44);
all_algos.emplace_back(&algo_fallback);

for (auto&& algo : all_algos) {
@@ -62,40 +60,6 @@ public:

PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack;

PoolingImpl::PoolingKernSizeParam PoolingImpl::make_pooling_kern_szie_param(
fallback::PoolingImpl* opr, const TensorLayout& src, const TensorLayout& dst) {
auto safe_u32 = [](size_t v) -> uint32_t {
megdnn_assert(
v <= std::numeric_limits<uint32_t>::max(), "value too large: %zu", v);
return v;
};
return {safe_u32(src.shape[0]),
safe_u32(src.shape[1]),
{{safe_u32(src.shape[2]), safe_u32(src.shape[3])}},
{{safe_u32(dst.shape[2]), safe_u32(dst.shape[3])}},
{{safe_u32(opr->param().pad_h), safe_u32(opr->param().pad_w)}},
{{safe_u32(opr->param().window_h), safe_u32(opr->param().window_w)}},
{{safe_u32(opr->param().stride_h), safe_u32(opr->param().stride_w)}},
src.dtype,
dst.dtype,
opr->handle(),
opr->param().format,
opr->param().mode};
};

PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param(
fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) {
PoolingKernParam ret;
static_cast<PoolingKernSizeParam&>(ret) =
make_pooling_kern_szie_param(opr, src.layout, dst.layout);
ret.src_ptr = src.get_ref_ptr();
ret.dst_ptr = dst.get_ref_ptr();
ret.workspace_ptr = workspace.raw_ptr;
ret.workspace_size = workspace.size;
return ret;
};

size_t PoolingImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) {
TensorLayoutArray layouts{src, dst};


+ 6
- 40
dnn/src/arm_common/pooling/opr_impl.h View File

@@ -19,6 +19,10 @@ namespace arm_common {

class PoolingImpl final : public fallback::PoolingImpl {
private:
//! TODO: remove
//! AlgoFilterxModexStride1/AlgoFilter2ModexStride2
//! AlgoFilter3AverageStride2/AlgoFilter4MaxStride2/AlgoFilter5MaxStride2
//! once GI gains float16 and int8 support in dnn/src/fallback/pooling/opr_impl.h
class AlgoFilterxModexStride1;
class AlgoFilter2ModexStride2;
class AlgoFilter3MaxStride2;
@@ -31,7 +35,6 @@ private:
class AlgoFilter3ModexStridexNCHW44;
class AlgoFilter4ModexStridexNCHW44;
class AlgoFilter5ModexStridexNCHW44;
class AlgoFp32ModexStridexNCHW44;
class AlgoFallback;
class AlgoPack;
static AlgoPack sm_algo_pack;
@@ -45,47 +48,10 @@ public:

static size_t constexpr MAX_SPATIAL_DIM = 2;

struct PoolingKernSizeParam {
uint32_t n, ic;
std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
std::array<uint32_t, MAX_SPATIAL_DIM> padding, filter, stride;
DType src_type, dst_type;
Handle* handle;
Param::Format format;
Mode mode;
};

struct PoolingKernParam : public PoolingKernSizeParam {
RefPtr src_ptr;
RefPtr dst_ptr;
void* workspace_ptr;
size_t workspace_size;

template <typename T>
const T* src() const {
src_type.assert_is_compatible_ctype<T>();
return static_cast<const T*>(src_ptr.get_ptr());
}

template <typename T>
T* dst() const {
dst_type.assert_is_compatible_ctype<T>();
return static_cast<T*>(dst_ptr.get_ptr());
}

template <typename T>
T* workspace() const {
return static_cast<T*>(workspace_ptr);
}
};
using PoolingKernSizeParam = fallback::PoolingImpl::PoolingKernSizeParam;

PoolingKernSizeParam make_pooling_kern_szie_param(
fallback::PoolingImpl* opr, const TensorLayout& src,
const TensorLayout& dst);
using PoolingKernParam = fallback::PoolingImpl::PoolingKernParam;

PoolingKernParam make_pooling_kern_param(
fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace);
class AlgoBase : public detail::Algorithm {
public:
enum class AlgoType : uint32_t {


+ 32
- 0
dnn/src/fallback/general_intrinsic/gi_float.h View File

@@ -1325,3 +1325,35 @@ GI_FORCEINLINE float32x2_t GiGetHighFloat32(GI_FLOAT32_t a) {
return ___gi_vget_high_f32(a);
#endif
}

GI_FORCEINLINE float32x2_t GiPaddFloat32(float32x2_t a, float32x2_t b) {
    //! Pairwise add: returns {a[0] + a[1], b[0] + b[1]}, matching the
    //! semantics of NEON vpadd_f32 on every backend.
#if defined(GI_NEON_INTRINSICS)
    return vpadd_f32(a, b);
#elif defined(GI_SSE2_INTRINSICS)
    //! SSE2 build: float32x2_t is an emulated two-lane struct with m64_f32 lanes
    float32x2_t res;
    res.m64_f32[0] = a.m64_f32[0] + a.m64_f32[1];
    res.m64_f32[1] = b.m64_f32[0] + b.m64_f32[1];
    return res;
#else
    //! plain scalar fallback
    float32x2_t res;
    res[0] = a[0] + a[1];
    res[1] = b[0] + b[1];
    return res;
#endif
}

GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) {
    //! Pairwise max: returns {max(a[0], a[1]), max(b[0], b[1])}, matching the
    //! semantics of NEON vpmax_f32 on every backend.
#if defined(GI_NEON_INTRINSICS)
    return vpmax_f32(a, b);
#elif defined(GI_SSE2_INTRINSICS)
    //! NOTE(review): MAX_NAN is defined elsewhere in the GI headers —
    //! presumably a NaN-propagating max; confirm its semantics there.
    float32x2_t res;
    res.m64_f32[0] = MAX_NAN(a.m64_f32[0], a.m64_f32[1]);
    res.m64_f32[1] = MAX_NAN(b.m64_f32[0], b.m64_f32[1]);
    return res;
#else
    //! plain scalar fallback
    float32x2_t res;
    res[0] = MAX_NAN(a[0], a[1]);
    res[1] = MAX_NAN(b[0], b[1]);
    return res;
#endif
}

+ 126
- 0
dnn/src/fallback/gi_intrinsic_helper.h View File

@@ -0,0 +1,126 @@
/**
* \file dnn/src/fallback/gi_intrinsic_helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/common/unroll_macro.h"
#include "src/fallback/general_intrinsic/gi_float.h"
namespace megdnn {
namespace {

//! Generic unrolled-load helper: loads `weight_number` vectors starting at
//! `ptr + base_offset`, stepping `ptr_step` elements per load, each produced
//! by Func::impl. The primary template is declaration-only; the macro-expanded
//! partial specializations below implement oc_block = 0 / 1 / 2.
template <
        int weight_number, int base_offset, int ptr_step, int oc_block, typename Func,
        typename T, typename T2, typename... XT>
struct LoadHelper {
    static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args);
};

#define WEIGHT_CB(step) \
src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...);

#define LOAD_HELPER(step) \
template < \
int base_offset, int ptr_step, typename Func, typename T, typename T2, \
typename... XT> \
struct LoadHelper<step, base_offset, ptr_step, 0, Func, T, T2, XT...> { \
static GI_FORCEINLINE void impl(T& src, T2 ptr, int, XT... args) { \
UNROLL_CALL_RAW(step, WEIGHT_CB); \
} \
}

LOAD_HELPER(1);
LOAD_HELPER(2);
LOAD_HELPER(3);
LOAD_HELPER(4);
LOAD_HELPER(5);
LOAD_HELPER(6);
LOAD_HELPER(7);
LOAD_HELPER(8);
LOAD_HELPER(9);
LOAD_HELPER(10);
LOAD_HELPER(11);
LOAD_HELPER(12);
LOAD_HELPER(13);
LOAD_HELPER(14);
LOAD_HELPER(15);
LOAD_HELPER(16);

#undef LOAD_HELPER
#undef WEIGHT_CB

///////////////////////////c_dim = 1/////////////////////////
#define WEIGHT_CB(step) src[0][step] = Func::impl(ptr + base_offset + step * ptr_step);

#define LOAD_HELPER(step) \
template <int base_offset, int ptr_step, typename Func, typename T, typename T2> \
struct LoadHelper<step, base_offset, ptr_step, 1, Func, T, T2> { \
static GI_FORCEINLINE void impl(T& src, T2 ptr, int) { \
UNROLL_CALL_RAW(step, WEIGHT_CB); \
} \
}

LOAD_HELPER(1);
LOAD_HELPER(2);
LOAD_HELPER(3);
LOAD_HELPER(4);
LOAD_HELPER(5);
LOAD_HELPER(6);
LOAD_HELPER(7);
LOAD_HELPER(8);
LOAD_HELPER(9);

#undef LOAD_HELPER
#undef WEIGHT_CB

/////////////////////////c_dim = 2///////////////////////////////
#define WEIGHT_CB(step) \
src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); \
src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset);

#define LOAD_HELPER(step) \
template <int base_offset, int ptr_step, typename Func, typename T, typename T2> \
struct LoadHelper<step, base_offset, ptr_step, 2, Func, T, T2> { \
static GI_FORCEINLINE void impl(T& src, T2 ptr, int oc_offset) { \
UNROLL_CALL_RAW(step, WEIGHT_CB); \
} \
}

LOAD_HELPER(1);
LOAD_HELPER(2);
LOAD_HELPER(3);
LOAD_HELPER(4);
LOAD_HELPER(5);
LOAD_HELPER(6);
LOAD_HELPER(7);
LOAD_HELPER(8);

#undef LOAD_HELPER
#undef WEIGHT_CB

//! Convenience wrapper over LoadHelper for loads that take no extra
//! per-load arguments.
template <
        int weight_number, int base_offset, int ptr_step, int c_dim, typename Func,
        typename T, typename T2>
GI_FORCEINLINE void load_helper(T& weight, T2 ptr, int oc_offset) {
    LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2>::impl(
            weight, ptr, oc_offset);
}

//! Variant of load_helper that forwards extra trailing arguments to
//! Func::impl for every load.
template <
        int weight_number, int base_offset, int ptr_step, int c_dim, typename Func,
        typename T, typename T2, typename... XT>
GI_FORCEINLINE void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) {
    LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2, XT...>::impl(
            weight, ptr, oc_offset, args...);
}

} // namespace
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 403
- 0
dnn/src/fallback/pooling/gi/algo.cpp View File

@@ -0,0 +1,403 @@
/**
* \file dnn/src/fallback/pooling/gi/algo.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "algo.h"
#include "do_max_pooling_w4x4_s2x2.h"
#include "megdnn/opr_param_defs.h"

#include "midout.h"

MIDOUT_DECL(megdnn_fallback_gi_pooling)

namespace megdnn {
namespace fallback {

//! Compute the per-thread workspace layout for the stride-2 pooling kernels:
//! one output-row buffer per filter row, plus two half-row buffers holding
//! the odd/even de-interleaved input columns.
WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam& param) {
    megdnn_assert(
            param.src_type.category() == DTypeCategory::FLOAT &&
            param.format == param::Pooling::Format::NCHW &&
            (param.mode == param::Pooling::Mode::MAX ||
             (param.mode == param::Pooling::Mode::AVERAGE && param.filter[0] == 3)) &&
            param.filter[0] == param.filter[1] &&
            //! fix: test filter[0] for both admissible sizes. The old code
            //! checked `filter[0] == 3 || filter[1] == 5`, which was only
            //! correct because filter[0] == filter[1] is asserted above.
            (param.filter[0] == 3 || param.filter[0] == 5) && param.stride[0] == 2 &&
            param.stride[1] == 2 && param.isz[0] >= 2 && param.isz[1] >= 2);
    //! max pooling nxn stride 2
    auto IW = param.isz[1];
    auto OW = param.osz[1];

    // In order to process an odd-size filter:
    // first store a row of the input split into odd and even columns,
    // then process those halves to produce one row of outputs.
    // We need to store n rows of results.
    SmallVector<size_t> needed_mem;
    for (size_t i = 0; i < param.filter[0]; ++i)
        needed_mem.push_back(OW * param.src_type.size());
    needed_mem.push_back((IW + 1) / 2 * param.src_type.size());
    needed_mem.push_back((IW + 1) / 2 * param.src_type.size());
    WorkspaceBundle ws(nullptr, needed_mem, 16);
    return ws;
}

bool PoolingImpl::AlgoGiFilterxModexStride1::usable(
        const PoolingKernSizeParam& param) const {
    //! Matches fp32 NCHW pooling with a square 2x2 or 3x3 window at unit
    //! stride, in either MAX or AVERAGE mode.
    const auto stride_h = param.stride[0];
    const auto stride_w = param.stride[1];
    const auto filter_h = param.filter[0];
    const auto filter_w = param.filter[1];

    const bool layout_ok = param.src_type.category() == DTypeCategory::FLOAT &&
                           param.format == Param::Format::NCHW;
    const bool window_ok = stride_h == 1 && stride_w == 1 && filter_h == filter_w &&
                           (filter_h == 2 || filter_h == 3);
    const bool mode_ok = param.mode == Mode::MAX || param.mode == Mode::AVERAGE;
    return layout_ok && window_ok && mode_ok;
}

void PoolingImpl::AlgoGiFilterxModexStride1::exec(const PoolingKernParam& param) const {
    //! Square-window (2x2 / 3x3) stride-1 pooling. Dispatch order:
    //! src dtype -> mode (MAX/AVERAGE) -> window size; each (n, c) plane
    //! becomes one task for the multi-thread dispatcher.
    auto IH = param.isz[0], IW = param.isz[1];
    auto OH = param.osz[0], OW = param.osz[1];
    auto N = param.n, C = param.ic;
    auto PH = param.padding[0];
    auto PW = param.padding[1];
    auto FH = param.filter[0];

    auto src_ptr = param.src_ptr;
    auto dst_ptr = param.dst_ptr;

//! Instantiate do_pooling_compact for a concrete Pooler/GiPooler/window and
//! enqueue one task per (n, c) plane.
#define DISPATCH_FUNC(Pooler, GiPooler, window, midout_type_id)                      \
    MIDOUT_BEGIN(                                                                    \
            megdnn_fallback_gi_pooling, midout_iv(0), midout_iv(midout_type_id),     \
            Pooler::MIDOUT_CASE_NUM, GiPooler::MIDOUT_CASE_NUM, window) {            \
        auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr,                     \
                    src_dtype = param.src_type](size_t index, size_t) {              \
            size_t n = index / C;                                                    \
            size_t c = index % C;                                                    \
            do_pooling_compact<Pooler MEGDNN_COMMA GiPooler MEGDNN_COMMA window>(    \
                    static_cast<const typename Pooler::ctype*>(src_ptr.get_ptr()) +  \
                            n * C * IH * IW + c * IH * IW,                           \
                    static_cast<typename Pooler::ctype*>(dst_ptr.get_ptr()) +        \
                            n * C * OH * OW + c * OH * OW,                           \
                    src_dtype, IH, IW, OH, OW, PH, PW);                              \
        };                                                                           \
        MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(                                       \
                static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
    }                                                                                \
    MIDOUT_END()

//! Choose the Pooler instantiation (window element count 4 or 9) from the
//! runtime filter size.
#define DISPATCH_WINDOW(Pooler, GiPooler, dtype, ctype, comp_type, midout_type_id) \
    switch (FH) {                                                                  \
        case 2: {                                                                  \
            using _Pooler = Pooler<4, dtype, ctype, comp_type>;                    \
            using _GiPooler = GiPooler<4, dtype, ctype, comp_type>;                \
            DISPATCH_FUNC(_Pooler, _GiPooler, 2, midout_type_id);                  \
            break;                                                                 \
        }                                                                          \
        case 3: {                                                                  \
            using _Pooler = Pooler<9, dtype, ctype, comp_type>;                    \
            using _GiPooler = GiPooler<9, dtype, ctype, comp_type>;                \
            DISPATCH_FUNC(_Pooler, _GiPooler, 3, midout_type_id);                  \
            break;                                                                 \
        }                                                                          \
        default:                                                                   \
            megdnn_assert(0, "unsupport pooling filter size");                     \
            break;                                                                 \
    }

//! Choose the pooler family from the pooling mode.
#define DISPATCH_MODE(dtype, ctype, comp_type, midout_type_id)                    \
    switch (param.mode) {                                                         \
        case Mode::MAX:                                                           \
            DISPATCH_WINDOW(                                                      \
                    MaxPooler, GiMaxPooler, dtype, ctype, comp_type, midout_type_id); \
            break;                                                                \
        case Mode::AVERAGE:                                                       \
            DISPATCH_WINDOW(                                                      \
                    MeanInPooler, GiMeanPooler, dtype, ctype, comp_type,          \
                    midout_type_id);                                              \
            break;                                                                \
        default:                                                                  \
            megdnn_assert(0, "unsupport pooling mode");                           \
            break;                                                                \
    }

    //! usable() guarantees a float-category dtype; only fp32 is handled here.
    if (param.src_type == dtype::Float32{}) {
        DISPATCH_MODE(dt_float32, float, float, 0);
    }
#undef DISPATCH_FUNC
#undef DISPATCH_WINDOW
#undef DISPATCH_MODE
}
bool PoolingImpl::AlgoGiFilter2ModexStride2::usable(
        const PoolingKernSizeParam& param) const {
    //! Matches fp32 NCHW pooling with a 2x2 window and stride 2, in either
    //! MAX or AVERAGE mode.
    const bool layout_ok = param.src_type.category() == DTypeCategory::FLOAT &&
                           param.format == Param::Format::NCHW;
    const bool window_ok = param.filter[0] == param.filter[1] &&
                           param.stride[0] == param.stride[1] &&
                           param.filter[0] == 2 && param.stride[0] == 2;
    const bool mode_ok = param.mode == Mode::MAX || param.mode == Mode::AVERAGE;
    return layout_ok && window_ok && mode_ok;
}

void PoolingImpl::AlgoGiFilter2ModexStride2::exec(const PoolingKernParam& param) const {
    //! 2x2-window stride-2 pooling; each (n, c) plane is one task for the
    //! multi-thread dispatcher.
    auto IH = param.isz[0], IW = param.isz[1];
    auto OH = param.osz[0], OW = param.osz[1];
    auto N = param.n, C = param.ic;
    auto PH = param.padding[0];
    auto PW = param.padding[1];

    auto src_ptr = param.src_ptr;
    auto dst_ptr = param.dst_ptr;
//! Instantiate do_pooling_2x2 for a concrete Pooler/mode pair.
#define DISPATCH_FUNC(Pooler, mode, midout_type_id)                                 \
    MIDOUT_BEGIN(                                                                   \
            megdnn_fallback_gi_pooling, midout_iv(1), midout_iv(midout_type_id),    \
            Pooler::MIDOUT_CASE_NUM) {                                              \
        auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr,                    \
                    src_dtype = param.src_type](size_t index, size_t) {             \
            size_t n = index / C;                                                   \
            size_t c = index % C;                                                   \
            do_pooling_2x2<Pooler MEGDNN_COMMA mode>(                               \
                    static_cast<const typename Pooler::ctype*>(src_ptr.get_ptr()) + \
                            n * C * IH * IW + c * IH * IW,                          \
                    static_cast<typename Pooler::ctype*>(dst_ptr.get_ptr()) +       \
                            n * C * OH * OW + c * OH * OW,                          \
                    src_dtype, IH, IW, OH, OW, PH, PW);                             \
        };                                                                          \
        MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(                                      \
                static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
    }                                                                               \
    MIDOUT_END()

//! Choose the pooler from the pooling mode (2x2 window => 4 elements).
#define DISPATCH_MODE(dtype, ctype, comp_type, midout_type_id)        \
    switch (param.mode) {                                             \
        case Mode::MAX: {                                             \
            using _Pooler = MaxPooler<4, dtype, ctype, comp_type>;    \
            DISPATCH_FUNC(_Pooler, Mode::MAX, midout_type_id);        \
            break;                                                    \
        }                                                             \
        case Mode::AVERAGE: {                                         \
            using _Pooler = MeanInPooler<4, dtype, ctype, comp_type>; \
            DISPATCH_FUNC(_Pooler, Mode::AVERAGE, midout_type_id);    \
            break;                                                    \
        }                                                             \
        default:                                                      \
            megdnn_assert(0, "unsupport pooling mode");               \
            break;                                                    \
    }

    //! usable() guarantees a float-category dtype; only fp32 is handled here.
    if (param.src_type == dtype::Float32{}) {
        DISPATCH_MODE(dt_float32, float, float, 0);
    }
#undef DISPATCH_FUNC
//! NOTE(review): DISPATCH_PAD is never defined in this function — the
//! #undef below is a harmless leftover (undef of an undefined macro is a no-op).
#undef DISPATCH_PAD
#undef DISPATCH_MODE
}

bool PoolingImpl::AlgoGiFilter3MaxStride2::usable(
        const PoolingKernSizeParam& param) const {
    //! Matches fp32 NCHW max pooling with a 3x3 window, stride 2, and an
    //! input of at least 2x2 (required by the SIMD row caching).
    return param.src_type.category() == DTypeCategory::FLOAT &&
           param.format == Param::Format::NCHW && param.mode == Mode::MAX &&
           param.filter[0] == 3 && param.filter[1] == 3 && param.stride[0] == 2 &&
           param.stride[1] == 2 && param.isz[0] >= 2 && param.isz[1] >= 2;
}

void PoolingImpl::AlgoGiFilter3MaxStride2::exec(const PoolingKernParam& param) const {
    //! 3x3 max pooling with stride 2. Each worker thread gets its own slice
    //! of the workspace (layout from get_bundle); one task per (n, c) plane.
    auto IH = param.isz[0], IW = param.isz[1];
    auto OH = param.osz[0], OW = param.osz[1];
    auto N = param.n, C = param.ic;
    auto PH = param.padding[0];
    auto PW = param.padding[1];

    auto src_ptr = param.src_ptr;
    auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(type, func, midout_type_id)                                   \
    MIDOUT_BEGIN(                                                                   \
            megdnn_fallback_gi_pooling, midout_iv(2), midout_iv(midout_type_id)) {  \
        WorkspaceBundle wbundle = get_bundle(param);                                \
        auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, wbundle = wbundle, \
                    workspace_ptr = param.workspace<dt_byte>()](                    \
                           size_t index, size_t thread_id) {                        \
            auto ws = wbundle;                                                      \
            ws.set(workspace_ptr + ws.total_size_in_bytes() * thread_id);           \
            size_t n = index / C;                                                   \
            size_t c = index % C;                                                   \
            do_max_pooling_3x3_s2x2_float_gi(                                       \
                    static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \
                            c * IH * IW,                                            \
                    static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW +       \
                            c * OH * OW,                                            \
                    IH, IW, OH, OW, PH, PW, ws);                                    \
        };                                                                          \
        MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(                                      \
                static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
    }                                                                               \
    MIDOUT_END();

    //! usable() guarantees a float-category dtype; only fp32 is handled here.
    if (param.src_type == dtype::Float32{}) {
        DISPATCH_FUNC(float, float, 0);
    }
#undef DISPATCH_FUNC
}
bool PoolingImpl::AlgoGiFilter3AverageStride2::usable(
        const PoolingKernSizeParam& param) const {
    //! Matches fp32 NCHW average pooling with a 3x3 window, stride 2, and an
    //! input of at least 2x2.
    return param.src_type.category() == DTypeCategory::FLOAT &&
           param.format == Param::Format::NCHW && param.mode == Mode::AVERAGE &&
           param.filter[0] == 3 && param.filter[1] == 3 && param.stride[0] == 2 &&
           param.stride[1] == 2 && param.isz[0] >= 2 && param.isz[1] >= 2;
}

void PoolingImpl::AlgoGiFilter3AverageStride2::exec(
        const PoolingKernParam& param) const {
    //! 3x3 average pooling with stride 2. Per-thread workspace slice (layout
    //! from get_bundle); one task per (n, c) plane.
    auto IH = param.isz[0], IW = param.isz[1];
    auto OH = param.osz[0], OW = param.osz[1];
    auto N = param.n, C = param.ic;
    auto PH = param.padding[0];
    auto PW = param.padding[1];

    auto src_ptr = param.src_ptr;
    auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(type, MEGDNN_SIMD_WIDTH, midout_type_id)                      \
    MIDOUT_BEGIN(                                                                   \
            megdnn_fallback_gi_pooling, midout_iv(3), midout_iv(midout_type_id)) {  \
        WorkspaceBundle wbundle = get_bundle(param);                                \
        auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, wbundle = wbundle, \
                    workspace_ptr = param.workspace<dt_byte>()](                    \
                           size_t index, size_t thread_id) {                        \
            auto ws = wbundle;                                                      \
            ws.set(workspace_ptr + ws.total_size_in_bytes() * thread_id);           \
            size_t n = index / C;                                                   \
            size_t c = index % C;                                                   \
            do_average_pooling_3x3_s2x2_gi(                                         \
                    static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \
                            c * IH * IW,                                            \
                    static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW +       \
                            c * OH * OW,                                            \
                    IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH);                 \
        };                                                                          \
        MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(                                      \
                static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
    }                                                                               \
    MIDOUT_END();
    //! usable() guarantees a float-category dtype; only fp32 is handled here
    //! (SIMD width 4 = one GI_FLOAT32 vector).
    if (param.src_type == dtype::Float32{}) {
        DISPATCH_FUNC(dt_float32, 4, 0);
    }
#undef DISPATCH_FUNC
}
bool PoolingImpl::AlgoGiFilter4MaxStride2::usable(
        const PoolingKernSizeParam& param) const {
    //! Matches fp32 NCHW max pooling with a 4x4 window, stride 2, and an
    //! output of at least 2x2.
    const bool layout_ok = param.src_type.category() == DTypeCategory::FLOAT &&
                           param.format == Param::Format::NCHW;
    const bool window_ok = param.filter[0] == 4 && param.filter[1] == 4 &&
                           param.stride[0] == 2 && param.stride[1] == 2;
    const bool shape_ok = param.osz[0] >= 2 && param.osz[1] >= 2;
    return layout_ok && param.mode == Mode::MAX && window_ok && shape_ok;
}

void PoolingImpl::AlgoGiFilter4MaxStride2::exec(const PoolingKernParam& param) const {
    //! 4x4 max pooling with stride 2; no workspace needed. One task per
    //! (n, c) plane.
    auto IH = param.isz[0], IW = param.isz[1];
    auto OH = param.osz[0], OW = param.osz[1];
    auto N = param.n, C = param.ic;
    auto PH = param.padding[0];
    auto PW = param.padding[1];

    auto src_ptr = param.src_ptr;
    auto dst_ptr = param.dst_ptr;

//! `func` is token-pasted into the kernel name, e.g.
//! do_max_pooling_w4x4_s2x2_float_gi for func == float.
#define DISPATCH_FUNC(type, func, midout_type_id)                                   \
    MIDOUT_BEGIN(                                                                   \
            megdnn_fallback_gi_pooling, midout_iv(4), midout_iv(midout_type_id)) {  \
        auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr,                    \
                    src_dtype = param.src_type](size_t index, size_t) {             \
            size_t n = index / C;                                                   \
            size_t c = index % C;                                                   \
            do_max_pooling_w4x4_s2x2_##func##_gi(                                   \
                    static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \
                            c * IH * IW,                                            \
                    static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW +       \
                            c * OH * OW,                                            \
                    src_dtype, IH, IW, OH, OW, PH, PW);                             \
        };                                                                          \
        MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(                                      \
                static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
    }                                                                               \
    MIDOUT_END();

    //! usable() guarantees a float-category dtype; only fp32 is handled here.
    if (param.src_type == dtype::Float32{}) {
        DISPATCH_FUNC(float, float, 0);
    }
#undef DISPATCH_FUNC
}
bool PoolingImpl::AlgoGiFilter5MaxStride2::usable(
        const PoolingKernSizeParam& param) const {
    //! Matches fp32 NCHW max pooling with a 5x5 window, stride 2, and an
    //! output of at least 2x2.
    const bool layout_ok = param.src_type.category() == DTypeCategory::FLOAT &&
                           param.format == Param::Format::NCHW;
    const bool window_ok = param.filter[0] == 5 && param.filter[1] == 5 &&
                           param.stride[0] == 2 && param.stride[1] == 2;
    const bool shape_ok = param.osz[0] >= 2 && param.osz[1] >= 2;
    return layout_ok && param.mode == Mode::MAX && window_ok && shape_ok;
}

void PoolingImpl::AlgoGiFilter5MaxStride2::exec(const PoolingKernParam& param) const {
    //! 5x5 max pooling with stride 2. Per-thread workspace slice (layout from
    //! get_bundle); one task per (n, c) plane.
    auto IH = param.isz[0], IW = param.isz[1];
    auto OH = param.osz[0], OW = param.osz[1];
    auto N = param.n, C = param.ic;
    auto PH = param.padding[0];
    auto PW = param.padding[1];

    auto src_ptr = param.src_ptr;
    auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(dtype, type, midout_type_id, MEGDNN_SIMD_WIDTH)               \
    MIDOUT_BEGIN(                                                                   \
            megdnn_fallback_gi_pooling, midout_iv(5), midout_iv(midout_type_id)) {  \
        WorkspaceBundle wbundle = get_bundle(param);                                \
        auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, wbundle = wbundle, \
                    workspace_ptr = param.workspace<dt_byte>()](                    \
                           size_t index, size_t thread_id) {                        \
            auto ws = wbundle;                                                      \
            ws.set(workspace_ptr + ws.total_size_in_bytes() * thread_id);           \
            size_t n = index / C;                                                   \
            size_t c = index % C;                                                   \
            do_max_pooling_w5x5_s2x2_gi<dtype>(                                     \
                    static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \
                            c * IH * IW,                                            \
                    static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW +       \
                            c * OH * OW,                                            \
                    IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH);                 \
        };                                                                          \
        MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(                                      \
                static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
    }                                                                               \
    MIDOUT_END();

    //! usable() guarantees a float-category dtype; only fp32 is handled here
    //! (SIMD width 4 = one GI_FLOAT32 vector).
    if (param.src_type == dtype::Float32{}) {
        DISPATCH_FUNC(dt_float32, float, 0, 4);
    }
#undef DISPATCH_FUNC
}

} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen

+ 103
- 0
dnn/src/fallback/pooling/gi/algo.h View File

@@ -0,0 +1,103 @@
/**
* \file dnn/src/fallback/pooling/gi/algo.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/common/utils.h"
#include "src/fallback/pooling/opr_impl.h"

#include "pooling_helper.h"

#include "src/naive/handle.h"
#include "src/naive/pooling/opr_impl.h"

namespace megdnn {
namespace fallback {

using AlgoBase = PoolingImpl::AlgoBase;

//! Square 2x2 / 3x3 window, stride 1, fp32 NCHW, MAX or AVERAGE mode.
class PoolingImpl::AlgoGiFilterxModexStride1 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_STRIDE1"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_FilterxModexStride1)
};

//! 2x2 window, stride 2, fp32 NCHW, MAX or AVERAGE mode.
class PoolingImpl::AlgoGiFilter2ModexStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_STRIDE2"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter2ModexStride2)
};
//! 3x3 window, stride 2, fp32 NCHW, MAX mode only.
class PoolingImpl::AlgoGiFilter3MaxStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FILTER3_MAX"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter3MaxStride2)
};

//! 3x3 window, stride 2, fp32 NCHW, AVERAGE mode only.
class PoolingImpl::AlgoGiFilter3AverageStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FILTER3_AVERAGE"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter3AverageStride2)
};

//! 4x4 window, stride 2, fp32 NCHW, MAX mode only.
class PoolingImpl::AlgoGiFilter4MaxStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FILTER4_MAX"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter4MaxStride2)
};

//! 5x5 window, stride 2, fp32 NCHW, MAX mode only.
class PoolingImpl::AlgoGiFilter5MaxStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FILTER5_MAX"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter5MaxStride2)
};

//! fp32 NCHW44-layout pooling; implementation lives in
//! algo_fp32_pooling_nchw44.cpp.
class PoolingImpl::AlgoGiFp32ModexStridexNCHW44 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FP32_MODEX_STRIDEX_NCHW44"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Fp32ModexStridexNCHW44)
};

//! Sentinel algorithm: usable() always returns true so algorithm selection
//! never comes up empty, but exec() must never actually run — the assert
//! guards against mis-dispatch.
class PoolingImpl::AlgoFallback final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "FALLBACK_NOT_GI_POOLING"; }
    bool usable(const PoolingKernSizeParam&) const override { return true; }
    void exec(const PoolingKernParam& /*param*/) const override {
        megdnn_assert(false, "code issue happened!!");
    }
    MEGDNN_DECL_ALGO_TYPE(FallbackNotGI)
};
WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam&);

} // namespace fallback
} // namespace megdnn

// vim: syntax=cpp.doxygen

dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp → dnn/src/fallback/pooling/gi/algo_fp32_pooling_nchw44.cpp View File

@@ -1,5 +1,5 @@
/**
* \file dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp
* \file dnn/src/fallback/pooling/gi/algo_fp32_pooling_nchw44.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -10,17 +10,17 @@
* implied.
*/

#include "algo.h"
#include "kern_fp32_pooling_nchw44.h"
#include "megdnn/opr_param_defs.h"
#include "src/arm_common/pooling/algo.h"
#include "src/arm_common/pooling/kern_fp32_pooling_nchw44.h"

#include "midout.h"

MIDOUT_DECL(megdnn_arm_common_fp32_pooling_nchw44)
MIDOUT_DECL(megdnn_fallback_fp32_pooling_nchw44)

namespace megdnn {
namespace arm_common {
bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable(
namespace fallback {
bool PoolingImpl::AlgoGiFp32ModexStridexNCHW44::usable(
const PoolingKernSizeParam& param) const {
uint32_t sh = param.stride[0];
uint32_t sw = param.stride[1];
@@ -37,7 +37,7 @@ bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable(
return avaible && size_ok;
}

void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec(
void PoolingImpl::AlgoGiFp32ModexStridexNCHW44::exec(
const PoolingKernParam& param) const {
int ih = param.isz[0];
int iw = param.isz[1];
@@ -55,7 +55,7 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec(

#define DISPATCH_FUNC(filter, stride, mode) \
MIDOUT_BEGIN( \
megdnn_arm_common_fp32_pooling_nchw44, midout_iv(0), \
megdnn_fallback_fp32_pooling_nchw44, midout_iv(0), \
midout_iv(#filter #stride #mode##_hash)) { \
auto run = [ih, iw, oh, ow, ph, pw, src_ptr, dst_ptr](size_t index, size_t) { \
const int c_idx = index; \
@@ -135,7 +135,7 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec(
#undef DISPATCH_FUNC
}

} // namespace arm_common
} // namespace fallback
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 157
- 0
dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp View File

@@ -0,0 +1,157 @@
/**
* \file dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "src/common/utils.h"

#include <algorithm>
#include <vector>
#include "do_max_pooling_3x3_s2x2_float.h"
#include "src/common/macro_helper.h"

namespace megdnn {
namespace fallback {

#define GI_UZP(s0, s1, d0, d1) \
do { \
auto tmp__ = GiUzpqFloat32(s0, s1); \
d0 = tmp__.val[0]; \
d1 = tmp__.val[1]; \
} while (0)

//! Max pooling, 3x3 window, 2x2 stride, fp32, vectorized with the fallback
//! general intrinsics (GI).
//!
//! Strategy: for each input row that contributes to the current output row,
//! first max-reduce along W into a per-row cache line, then max-reduce the
//! (up to 3) cached lines along H.  Because the stride is 2, the W reduction
//! operates on the de-interleaved even/odd columns of the source row, which
//! turns the strided window access into contiguous SIMD loads.
//!
//! \param ws scratch buffers: ws[0..2] are three row caches, ws[3]/ws[4]
//!           hold the odd/even column split of one input row.
void do_max_pooling_3x3_s2x2_float_gi(
        const float* src, float* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_,
        size_t PH_, size_t PW_, const WorkspaceBundle& ws) {
    int IH = IH_, IW = IW_, OH = OH_, OW = OW_, PH = PH_, PW = PW_;
    // cache[i] stores the answer of the i-th line after
    // pooling along the W dimension.
    float* cache[3] = {
            static_cast<float*>(ws.get(0)), static_cast<float*>(ws.get(1)),
            static_cast<float*>(ws.get(2))};
    float* odd = static_cast<float*>(ws.get(3));
    float* even = static_cast<float*>(ws.get(4));
    int ih_next = 0;
    // "good" area means we can use SIMD to accelerate.
    auto get_good_area = [](int I, int /* O */, int P, int& O_from, int& O_to) {
        // x*2 - P >= 0; 2x >= P; x >= P/2
        O_from = (P + 1) / 2;
        // x*2 - P + 3 <= I; x*2 <= I+P-3; x <= (I+P-3)/2
        O_to = (I + P - 3) / 2 + 1;
        // we must have I >= 2 to ensure O_from <= O_to
    };
    int OW_from, OW_to;
    get_good_area(IW, OW, PW, OW_from, OW_to);
    // W-reduce input row `ih` into cache[0]; the caches are rotated first so
    // cache[1]/cache[2] keep the two previous rows.
    auto process_cache = [&](int ih) {
        const float* __restrict sptr = src + ih * IW;
        auto tmp = cache[2];
        cache[2] = cache[1];
        cache[1] = cache[0];
        cache[0] = tmp;
        // cache 0 is used to store the current answer.
        // scalar fallback with border clipping, used outside the "good" area
        auto run_single = [&](int ow) {
            int iw = ow * 2 - PW;
            float res = std::numeric_limits<float>::lowest();
            if (iw + 0 >= 0 && iw + 0 < IW) {
                res = std::max(res, sptr[iw + 0]);
            }
            if (iw + 1 >= 0 && iw + 1 < IW) {
                res = std::max(res, sptr[iw + 1]);
            }
            if (iw + 2 >= 0 && iw + 2 < IW) {
                res = std::max(res, sptr[iw + 2]);
            }
            cache[0][ow] = res;
        };
        // build odd/even
        int iw = 0;
        int odd_offset = 0, even_offset = 0;

        for (; iw + 2 * 4 <= IW; iw += 2 * 4) {
            GI_FLOAT32_t s0, s1, d0, d1;
            s0 = GiLoadFloat32(sptr + iw);
            s1 = GiLoadFloat32(sptr + iw + 4);
            GI_UZP(s0, s1, d0, d1);
            GiStoreFloat32(even + even_offset, d0);
            GiStoreFloat32(odd + odd_offset, d1);
            even_offset += 4;
            odd_offset += 4;
        }
        for (; iw < IW; ++iw) {
            if (iw & 1)
                odd[odd_offset++] = sptr[iw];
            else
                even[even_offset++] = sptr[iw];
        }
        int ow = 0;
        for (; ow < OW_from; ++ow)
            run_single(ow);
        // the three window columns map onto {odd, even, odd} or
        // {even, odd, even} depending on the parity of PW
        if (PW & 1) {
            for (; ow + 4 <= OW_to; ow += 4) {
                GI_FLOAT32_t d, s0, s1, s2;
                s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1);
                s1 = GiLoadFloat32(even + ow - (PW >> 1));
                s2 = GiLoadFloat32(odd + ow - (PW >> 1));
                d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2);
                GiStoreFloat32(cache[0] + ow, d);
            }
        } else {
            for (; ow + 4 <= OW_to; ow += 4) {
                GI_FLOAT32_t d, s0, s1, s2;
                s0 = GiLoadFloat32(even + ow - (PW >> 1));
                s1 = GiLoadFloat32(odd + ow - (PW >> 1));
                s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1);
                d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2);
                GiStoreFloat32(cache[0] + ow, d);
            }
        }
        for (; ow < OW; ++ow)
            run_single(ow);
    };
    for (int oh = 0; oh < OH; ++oh) {
        float* __restrict dptr = dst + oh * OW;
        int ih_from = std::min(IH, std::max(0, oh * 2 - PH));
        int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 3));
        // output rows are visited in order, so each input row is W-reduced
        // exactly once
        while (ih_next < ih_to) {
            process_cache(ih_next++);
        }
        if (ih_to - ih_from == 3) {
            // full 3-row window: reduce the three cached lines
            int ow = 0;
            for (; ow + 4 <= OW; ow += 4) {
                GI_FLOAT32_t d, s0, s1, s2;
                s0 = GiLoadFloat32(cache[0] + ow);
                s1 = GiLoadFloat32(cache[1] + ow);
                s2 = GiLoadFloat32(cache[2] + ow);
                d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2);
                GiStoreFloat32(dptr + ow, d);
            }
            for (; ow < OW; ++ow) {
                dptr[ow] = std::max(std::max(cache[0][ow], cache[1][ow]), cache[2][ow]);
            }
        } else {
            // window clipped by top/bottom padding: reduce only the rows that
            // actually exist
            std::memcpy(dptr, cache[0], sizeof(float) * OW);
            for (int i = 1; i < ih_to - ih_from; ++i) {
                int ow = 0;
                for (; ow + 4 <= OW; ow += 4) {
                    GI_FLOAT32_t d, s;
                    s = GiLoadFloat32(cache[i] + ow);
                    d = GiLoadFloat32(dptr + ow);
                    d = GiMaximumFloat32(d, s);
                    GiStoreFloat32(dptr + ow, d);
                }
                for (; ow < OW; ++ow) {
                    dptr[ow] = std::max(dptr[ow], cache[i][ow]);
                }
            }
        }
    }
}

} // namespace fallback
} // namespace megdnn

+ 26
- 0
dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h View File

@@ -0,0 +1,26 @@
/**
* \file dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#pragma once

#include "src/common/utils.h"

#include "megdnn/arch.h"

#include "src/fallback/general_intrinsic/gi_float.h"

namespace megdnn {
namespace fallback {

//! Max pooling with a 3x3 window and 2x2 stride on fp32 data, implemented
//! with the fallback general intrinsics (GI).  \p ws supplies the scratch
//! buffers (three row caches plus the odd/even column split of one row).
void do_max_pooling_3x3_s2x2_float_gi(
        const float* src, float* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_,
        size_t PH_, size_t PW_, const WorkspaceBundle& ws);

} // namespace fallback
} // namespace megdnn

+ 89
- 0
dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp View File

@@ -0,0 +1,89 @@
/**
* \file dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "do_max_pooling_w4x4_s2x2.h"
#include "pooling_helper.h"

namespace megdnn {
namespace fallback {

//! Max pooling, 4x4 window, 2x2 stride, fp32, GI-vectorized.
//!
//! For interior rows the four window rows are max-combined vertically, then a
//! pairwise max (GiPmaxFloat32) folds each pair of adjacent columns.  With
//! stride 2 and window 4 consecutive outputs share one column pair, which
//! `last_hf_res` carries from one iteration to the next.  Border pixels fall
//! back to the scalar MaxPooler kernel.
void do_max_pooling_w4x4_s2x2_float_gi(
        const dt_float32* src, dt_float32* dst, DType src_dtype, const int IH,
        const int IW, const int OH, const int OW, const int PH, const int PW) {
    const int window = 4;
    const int stride = 2;
    using Pooler = MaxPooler<16, dt_float32, float, float>;
    int oh = 0;
    // top rows whose window sticks out above the input: scalar fallback
    for (; oh < OH && -PH + stride * oh < 0; ++oh) {
        int ow = 0;
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
    // rows whose 4-row window lies fully inside the input
    for (; oh < OH && -PH + stride * oh + window <= IH; ++oh) {
        int ow = 0;
        for (; ow < OW && -PW + stride * ow < 0; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
        dt_float32 last_hf_res = -std::numeric_limits<dt_float32>::infinity();
        int ih = -PH + stride * oh, iw = -PW + stride * ow;
        // prime the pipeline: the first fully-inside output also seeds
        // last_hf_res for the loop below
        if (-PW + stride * ow + window <= IW) {
            GI_FLOAT32_t i0 = GiLoadFloat32(src + (ih + 0) * IW + iw),
                         i1 = GiLoadFloat32(src + (ih + 1) * IW + iw),
                         i2 = GiLoadFloat32(src + (ih + 2) * IW + iw),
                         i3 = GiLoadFloat32(src + (ih + 3) * IW + iw);
            GI_FLOAT32_t sum0 = GiMaximumFloat32(
                    GiMaximumFloat32(i0, i1), GiMaximumFloat32(i2, i3));
            float32x2_t t =
                    GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0));
            dst[oh * OW + ow] =
                    std::max(GiGetLaneFloat32(t, 0), GiGetLaneFloat32(t, 1));
            last_hf_res = GiGetLaneFloat32(t, 1);
            ow += 1;
        }
        // steady state: one 4-column load covers the second half of output ow
        // and the whole of output ow+1
        for (; ow + 1 < OW && -PW + stride * (ow + 1) + window <= IW; ow += 2) {
            iw = -PW + stride * (ow + 1);
            GI_FLOAT32_t i0 = GiLoadFloat32(src + (ih + 0) * IW + iw),
                         i1 = GiLoadFloat32(src + (ih + 1) * IW + iw),
                         i2 = GiLoadFloat32(src + (ih + 2) * IW + iw),
                         i3 = GiLoadFloat32(src + (ih + 3) * IW + iw);
            GI_FLOAT32_t sum0 = GiMaximumFloat32(
                    GiMaximumFloat32(i0, i1), GiMaximumFloat32(i2, i3));
            float32x2_t t =
                    GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0));
            dst[oh * OW + ow + 0] = std::max(GiGetLaneFloat32(t, 0), last_hf_res);
            dst[oh * OW + ow + 1] =
                    std::max(GiGetLaneFloat32(t, 0), GiGetLaneFloat32(t, 1));
            last_hf_res = GiGetLaneFloat32(t, 1);
        }
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
    // bottom rows whose window sticks out below the input: scalar fallback
    for (; oh < OH; ++oh) {
        int ow = 0;
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
}

} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen

+ 24
- 0
dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h View File

@@ -0,0 +1,24 @@
/**
* \file dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/fallback/pooling/opr_impl.h"

namespace megdnn {
namespace fallback {

//! Max pooling with a 4x4 window and 2x2 stride on fp32 data, using the
//! fallback general intrinsics (GI); border pixels are handled by the
//! scalar naive kernel.
void do_max_pooling_w4x4_s2x2_float_gi(
        const dt_float32* src, dt_float32* dst, DType src_dtype, const int IH,
        const int IW, const int OH, const int OW, const int PH, const int PW);
} // namespace fallback
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 306
- 0
dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h View File

@@ -0,0 +1,306 @@
/**
* \file dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <limits>
#include "megdnn/opr_param_defs.h"
#include "src/common/unroll_macro.h"
#include "src/fallback/general_intrinsic/gi_float.h"
#include "src/fallback/gi_intrinsic_helper.h"

namespace megdnn {
namespace fallback {
namespace {

//! Accumulates one pooling-window row into the 4 per-output result vectors;
//! the primary template is declaration-only, the real work is done by the
//! per-filter/per-mode specializations stamped out by INSTANCE_CAL below.
template <
        int filter, int stride, int ow_step, PoolingBase::Mode mode, typename T1,
        typename T2>
struct CalXsXNchw44 {
    static void impl(T1 result, T2 src);
};

//! Loader policy for load_helper: reads one GI_FLOAT32_t (4 floats) from ptr.
struct GiD1Qf32 {
    static GI_FORCEINLINE GI_FLOAT32_t impl(const float32_t* ptr) {
        return GiLoadFloat32(ptr);
    }
};

//! Thin wrapper that forwards to the CalXsXNchw44 specialization selected by
//! the template arguments.
template <
        int filter, int stride, int ow_step, PoolingBase::Mode mode, typename T1,
        typename T2>
void calculate_xsx_nchw44(T1 result, T2 src) {
    CalXsXNchw44<filter, stride, ow_step, mode, T1, T2>::impl(result, src);
}

//! Fold window column \p step into the 4 result vectors (one vector per
//! output position of the ow_step=4 group), MAX flavour.
#define CALCULATE_MAX_CB(step) \
    result[0] = GiMaximumFloat32(result[0], src[0 * stride + step]); \
    result[1] = GiMaximumFloat32(result[1], src[1 * stride + step]); \
    result[2] = GiMaximumFloat32(result[2], src[2 * stride + step]); \
    result[3] = GiMaximumFloat32(result[3], src[3 * stride + step]);

//! Same as CALCULATE_MAX_CB but accumulates a sum for AVERAGE pooling.
#define CALCULATE_AVG_CB(step) \
    result[0] = GiAddFloat32(result[0], src[0 * stride + step]); \
    result[1] = GiAddFloat32(result[1], src[1 * stride + step]); \
    result[2] = GiAddFloat32(result[2], src[2 * stride + step]); \
    result[3] = GiAddFloat32(result[3], src[3 * stride + step]);

//! Stamp out the MAX and AVERAGE specializations (ow_step fixed at 4) for one
//! filter size; UNROLL_CALL_RAW expands the per-column callback `filter`
//! times.
#define INSTANCE_CAL(filter) \
    template <int stride, typename T1, typename T2> \
    struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::MAX, T1, T2> { \
        static void impl(T1 result, T2 src) { \
            UNROLL_CALL_RAW(filter, CALCULATE_MAX_CB); \
        } \
    }; \
    template <int stride, typename T1, typename T2> \
    struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::AVERAGE, T1, T2> { \
        static void impl(T1 result, T2 src) { \
            UNROLL_CALL_RAW(filter, CALCULATE_AVG_CB); \
        } \
    };

INSTANCE_CAL(2)
INSTANCE_CAL(3)
INSTANCE_CAL(4)
INSTANCE_CAL(5)
INSTANCE_CAL(9)
INSTANCE_CAL(13)

#undef INSTANCE_CAL
#undef CALCULATE_AVG_CB
#undef CALCULATE_MAX_CB

//! Vector micro-kernel: pools ow_step output positions of one packed
//! 4-channel NCHW44 group; specialized below for MAX and AVERAGE.
template <int filter, int stride, int ow_step, PoolingBase::Mode mode>
struct KerPoolingFilterXStrideXNchw44 {
    static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw);
};

//! MAX specialization.  The caller guarantees the whole window of every
//! pooled output lies inside the input.
template <int filter, int stride, int ow_step>
struct KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, PoolingBase::Mode::MAX> {
    static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) {
        // vectors needed to cover ow_step outputs including window overlap
        constexpr int src_reg_size = ow_step * stride + filter - stride;
        constexpr int packed_ic = 4;
        constexpr int simd_len = 4;
        // seed with lowest() so the first comparison always wins
        constexpr float default_float = std::numeric_limits<float>::lowest();
        GI_FLOAT32_t result[ow_step];
        GI_FLOAT32_t src[src_reg_size];

        result[0] = GiBroadcastFloat32(default_float);
        result[1] = GiBroadcastFloat32(default_float);
        result[2] = GiBroadcastFloat32(default_float);
        result[3] = GiBroadcastFloat32(default_float);

        // one filter row per iteration: load the row, fold it into result
        for (int fh_idx = 0; fh_idx < filter; ++fh_idx) {
            load_helper<src_reg_size, 0, simd_len, 0, GiD1Qf32>(
                    src, src_ptr + fh_idx * iw * packed_ic, 0);
            calculate_xsx_nchw44<filter, stride, ow_step, PoolingBase::Mode::MAX>(
                    result, src);
        }

        GiStoreFloat32(dst_ptr + 0 * packed_ic, result[0]);
        GiStoreFloat32(dst_ptr + 1 * packed_ic, result[1]);
        GiStoreFloat32(dst_ptr + 2 * packed_ic, result[2]);
        GiStoreFloat32(dst_ptr + 3 * packed_ic, result[3]);
    }
};

//! AVERAGE specialization: accumulates sums then scales by 1/(filter*filter).
//! Only used on fully-interior windows, so the full-window divisor is exact.
template <int filter, int stride, int ow_step>
struct KerPoolingFilterXStrideXNchw44<
        filter, stride, ow_step, PoolingBase::Mode::AVERAGE> {
    static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) {
        // vectors needed to cover ow_step outputs including window overlap
        constexpr int src_reg_size = ow_step * stride + filter - stride;
        constexpr int packed_ic = 4;
        constexpr int simd_len = 4;
        constexpr float default_float = 0;
        constexpr float div_filter_size = 1.f / (filter * filter);
        const GI_FLOAT32_t div_filter_size_vec = GiBroadcastFloat32(div_filter_size);
        GI_FLOAT32_t result[ow_step];
        GI_FLOAT32_t src[src_reg_size];

        result[0] = GiBroadcastFloat32(default_float);
        result[1] = GiBroadcastFloat32(default_float);
        result[2] = GiBroadcastFloat32(default_float);
        result[3] = GiBroadcastFloat32(default_float);

        // one filter row per iteration: load the row, add it into result
        for (int fh_idx = 0; fh_idx < filter; ++fh_idx) {
            load_helper<src_reg_size, 0, simd_len, 0, GiD1Qf32>(
                    src, src_ptr + fh_idx * iw * packed_ic, 0);
            calculate_xsx_nchw44<filter, stride, ow_step, PoolingBase::Mode::AVERAGE>(
                    result, src);
        }
        result[0] = GiMultiplyFloat32(result[0], div_filter_size_vec);
        result[1] = GiMultiplyFloat32(result[1], div_filter_size_vec);
        result[2] = GiMultiplyFloat32(result[2], div_filter_size_vec);
        result[3] = GiMultiplyFloat32(result[3], div_filter_size_vec);
        GiStoreFloat32(dst_ptr + 0 * packed_ic, result[0]);
        GiStoreFloat32(dst_ptr + 1 * packed_ic, result[1]);
        GiStoreFloat32(dst_ptr + 2 * packed_ic, result[2]);
        GiStoreFloat32(dst_ptr + 3 * packed_ic, result[3]);
    }
};

//! Scalar (single-output) NCHW44 kernel for windows that overlap the padded
//! border; pad_* give how many window rows/columns fall outside the input,
//! src_ptr already points at the first in-bounds element of the window.
template <PoolingBase::Mode mode>
void ker_pooling_nchw44_remain_pad(
        const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top,
        const int pad_bottom, const int pad_left, const int pad_right,
        const int filter);
//! MAX over the in-bounds portion of the window (padded elements are simply
//! skipped, which is equivalent to treating them as -inf).
template <>
void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::MAX>(
        const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top,
        const int pad_bottom, const int pad_left, const int pad_right,
        const int filter) {
    constexpr int ic_step = 4;
    const int ih_end = filter - pad_bottom;
    const int iw_end = filter - pad_right;
    GI_FLOAT32_t result = GiBroadcastFloat32(std::numeric_limits<float>::lowest());
    for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) {
        for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) {
            GI_FLOAT32_t src = GiLoadFloat32(src_ptr + (iw_idx - pad_left) * ic_step);
            result = GiMaximumFloat32(result, src);
        }
        src_ptr += iw * ic_step;
    }
    GiStoreFloat32(dst_ptr, result);
}

//! AVERAGE over the in-bounds portion of the window.  Note the divisor is the
//! full filter*filter even when part of the window is padding, i.e. padded
//! elements count as zeros (count-include-pad semantics).
template <>
void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::AVERAGE>(
        const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top,
        const int pad_bottom, const int pad_left, const int pad_right,
        const int filter) {
    constexpr int ic_step = 4;
    const int ih_end = filter - pad_bottom;
    const int iw_end = filter - pad_right;
    const float div_filter_size = 1.f / (filter * filter);
    const GI_FLOAT32_t div_filter_size_vec = GiBroadcastFloat32(div_filter_size);
    GI_FLOAT32_t result = GiBroadcastFloat32(0.f);

    for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) {
        for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) {
            GI_FLOAT32_t src = GiLoadFloat32(src_ptr + (iw_idx - pad_left) * ic_step);
            result = GiAddFloat32(result, src);
        }
        src_ptr += iw * ic_step;
    }
    result = GiMultiplyFloat32(result, div_filter_size_vec);
    GiStoreFloat32(dst_ptr, result);
}

//! Run the scalar border kernel for outputs [ow_start, ow_end) of row oh_idx,
//! computing the left/right window clipping for each output column.
template <PoolingBase::Mode mode>
static inline void kern_pooling_with_pad_nchw44(
        const float32_t* src, float32_t* dst, const int filter, const int ow_start,
        const int ow_end, const int iw, const int ow, const int stride_w, const int pw,
        const int real_ih_idx, const int oh_idx, const int pad_top,
        const int pad_bottom) {
    constexpr int ic_step = 4;
    constexpr int oc_step = 4;
    for (int ow_idx = ow_start; ow_idx < ow_end; ++ow_idx) {
        const int iw_idx = ow_idx * stride_w;
        // clamp the window start into the input and derive the clipped sizes
        const int real_iw_idx = std::max(iw_idx - pw, 0);
        const int pad_left = std::max(0, pw - iw_idx);
        const int pad_right = std::max(0, iw_idx - pw + filter - iw);
        const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step;
        const int dst_offset = (oh_idx * ow + ow_idx) * oc_step;
        ker_pooling_nchw44_remain_pad<mode>(
                src + src_offset, dst + dst_offset, iw, pad_top, pad_bottom, pad_left,
                pad_right, filter);
    }
}

//! Padded NCHW44 path: rows touching the top/bottom border go entirely
//! through the scalar border kernel; interior rows use the vector kernel for
//! the columns whose windows are fully inside the input and the scalar
//! kernel for the left/right fringes.
template <int filter, int stride, PoolingBase::Mode mode>
static inline void pooling_fp32_nchw44_pad(
        const float32_t* src, float32_t* dst, int ih, int iw, int oh, int ow, int ph,
        int pw) {
    constexpr int stride_h = stride;
    constexpr int stride_w = stride;
    constexpr int ic_step = 4;
    constexpr int oc_step = 4;
    constexpr int ow_step = 4;
    // first output column whose window clears the left padding
    const int ow_pad_left_end = div_ceil(pw, stride_w);
    // last output column whose window still fits before the right border
    const int ow_pad_right_end = (iw - filter + pw - 1) / stride_w;
    // round the vectorizable span down to a multiple of ow_step
    const int ow_pad_right_step_end =
            (ow_pad_right_end - ow_pad_left_end) / ow_step * ow_step + ow_pad_left_end;

    rep(oh_idx, oh) {
        const int ih_idx = oh_idx * stride_h;
        const int real_ih_idx = std::max(ih_idx - ph, 0);
        const int pad_top = std::max(0, ph - ih_idx);
        const int pad_bottom = std::max(0, ih_idx - ph + filter - ih);
        if (pad_top > 0 || pad_bottom > 0) {
            // the whole row overlaps the vertical border: scalar kernel only
            kern_pooling_with_pad_nchw44<mode>(
                    src, dst, filter, 0, ow, iw, ow, stride_w, pw, real_ih_idx, oh_idx,
                    pad_top, pad_bottom);

        } else {
            kern_pooling_with_pad_nchw44<mode>(
                    src, dst, filter, 0, ow_pad_left_end, iw, ow, stride_w, pw,
                    real_ih_idx, oh_idx, pad_top, pad_bottom);
            for (int ow_idx = ow_pad_left_end; ow_idx < ow_pad_right_step_end;
                 ow_idx += ow_step) {
                const int iw_idx = ow_idx * stride_w;
                const int real_iw_idx = std::max(iw_idx - pw, 0);
                const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step;
                const int dst_offset = (oh_idx * ow + ow_idx) * oc_step;
                KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, mode>::impl(
                        src + src_offset, dst + dst_offset, iw);
            }
            kern_pooling_with_pad_nchw44<mode>(
                    src, dst, filter, ow_pad_right_step_end, ow, iw, ow, stride_w, pw,
                    real_ih_idx, oh_idx, pad_top, pad_bottom);
        }
    }
}

//! Pad-free NCHW44 path: vector kernel over full groups of ow_step outputs
//! per row; the ow tail goes through the border kernel with zero padding.
template <int filter, int stride, PoolingBase::Mode mode>
static inline void pooling_fp32_nchw44_no_pad(
        const float32_t* src, float32_t* dst, int, int iw, int oh, int ow) {
    constexpr int stride_h = stride;
    constexpr int stride_w = stride;
    constexpr int ic_step = 4;
    constexpr int oc_step = 4;
    constexpr int ow_step = 4;
    const int ow_end = ow / ow_step * ow_step;
    const int ow_remain = ow - ow_end;

    rep(oh_idx, oh) {
        const int ih_idx = oh_idx * stride_h;
        const int src_ih_offset = ih_idx * iw;
        const int dst_oh_offset = oh_idx * ow;
        for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) {
            const int iw_idx = ow_idx * stride_w;
            const int src_offset = (src_ih_offset + iw_idx) * ic_step;
            const int dst_offset = (dst_oh_offset + ow_idx) * oc_step;
            KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, mode>::impl(
                    src + src_offset, dst + dst_offset, iw);
        }
        if (ow_remain > 0) {
            kern_pooling_with_pad_nchw44<mode>(
                    src, dst, filter, ow_end, ow, iw, ow, stride_w, 0, ih_idx, oh_idx,
                    0, 0);
        }
    }
}

//! Entry point for fp32 NCHW44 pooling: dispatch to the padded or the
//! pad-free implementation depending on whether any padding is requested.
template <int filter, int stride, PoolingBase::Mode mode>
static inline void pooling_fp32_nchw44(
        const float32_t* src, float32_t* dst, int ih, int iw, int oh, int ow, int ph,
        int pw) {
    if (ph <= 0 && pw <= 0) {
        pooling_fp32_nchw44_no_pad<filter, stride, mode>(src, dst, ih, iw, oh, ow);
    } else {
        pooling_fp32_nchw44_pad<filter, stride, mode>(
                src, dst, ih, iw, oh, ow, ph, pw);
    }
}

} // namespace
} // namespace fallback
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 572
- 0
dnn/src/fallback/pooling/gi/pooling_helper.h View File

@@ -0,0 +1,572 @@
/**
* \file dnn/src/fallback/pooling/gi/pooling_helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#pragma once

#include "do_max_pooling_3x3_s2x2_float.h"
#include "megdnn/dtype.h"
#include "src/common/unroll_macro.h"
#include "src/common/utils.h"

namespace {

/* ======================= MeanPooler ======================== */
using namespace megdnn;
/**
* \brief Mean mode for pooling
* \tparam area the pooling area size, FH * FW
* \tparam dtype the input type
* \tparam ctype the inner raw type
* \tparam comp_type compute type
*/
template <int area, typename dtype, typename ctype, typename comp_type>
struct MeanPoolerCommon {
    //! the gi imp register size is 16 bytes(128 bits)
    static constexpr int SIMD_WIDTH = 16 / sizeof(ctype);
    //! reciprocal of the window area; applied once in the derived post()
    static constexpr comp_type coef = static_cast<comp_type>(1.0f) / area;
    comp_type res;
    MeanPoolerCommon() : res(0) {}
    void feed(const ctype* val) { res += *val; }
};
//! out-of-line definition kept for pre-C++17 ODR-use of the constexpr member
template <int area, typename dtype, typename ctype, typename comp_type>
constexpr comp_type MeanPoolerCommon<area, dtype, ctype, comp_type>::coef;

//! Scalar mean pooler: post() scales the running sum by 1/area and stores it.
template <int area, typename dtype, typename _ctype, typename comp_type>
struct MeanInPooler : MeanPoolerCommon<area, dtype, _ctype, comp_type> {
    using ctype = _ctype;
    //! `MIDOUT_CASE_NUM` is a unique int id
    static constexpr int MIDOUT_CASE_NUM = 1;
    MeanInPooler(DType) : MeanPoolerCommon<area, dtype, _ctype, comp_type>() {}
    void post(ctype* dst) {
        this->res *= this->coef;
        *dst = this->res;
    }
};

//! Mean pooler variant that rounds the averaged value (std::round) before
//! storing it.
template <int area, typename dtype, typename _ctype>
struct MeanInRoundPooler : MeanPoolerCommon<area, dtype, _ctype, float> {
    using ctype = _ctype;
    void post(ctype* dst) {
        this->res *= this->coef;
        *dst = std::round(this->res);
    }
};

//! SIMD mean pooler: accumulates 4 lanes at a time.  The primary template is
//! declaration-only; only the fp32 specialization is provided here.
template <int area, typename dtype, typename ctype, typename comp_type>
struct GiMeanPooler;

template <int area>
struct GiMeanPooler<area, dt_float32, float, float> {
    using ctype = float;
    static constexpr int MIDOUT_CASE_NUM = 1;
    static constexpr int SIMD_WIDTH = 4;

    static const GI_FLOAT32_t coef;
    GI_FLOAT32_t res;
    GiMeanPooler(DType) : res(GiBroadcastFloat32(0.0f)) {}
    void feed(const float* val) { res = GiAddFloat32(res, GiLoadFloat32(val)); }
    void post(float* dst) {
        res = GiMultiplyFloat32(res, coef);
        GiStoreFloat32(dst, res);
    }
};
//! broadcast 1/area once per template instantiation
template <int area>
const GI_FLOAT32_t GiMeanPooler<area, dt_float32, float, float>::coef =
        GiBroadcastFloat32(1.0f / area);

/* ======================= MaxPooler ======================== */

//! Scalar max pooler: tracks the running maximum, seeded with the dtype's
//! minimum so the first feed always wins.
template <int area, typename dtype, typename _ctype, typename comp_type>
struct MaxPooler {
    using ctype = _ctype;
    static constexpr int MIDOUT_CASE_NUM = 11;
    static constexpr int SIMD_WIDTH = 16 / sizeof(ctype);

    static const ctype outsider;
    ctype res;
    MaxPooler(DType) : res(DTypeTrait<dtype>::min()) {}
    void feed(const ctype* val) { res = std::max(res, *val); }
    void post(ctype* dst) { *dst = res; }
};
//! NOTE(review): `outsider` is not referenced in this header; presumably the
//! value standing in for out-of-bounds elements elsewhere — confirm callers.
template <int area, typename dtype, typename ctype, typename comp_type>
const ctype MaxPooler<area, dtype, ctype, comp_type>::outsider =
        DTypeTrait<dtype>::min();

//! SIMD max pooler; primary template is declaration-only, only the fp32
//! specialization is provided here.
template <int area, typename dtype, typename ctype, typename comp_type>
struct GiMaxPooler;

template <int area>
struct GiMaxPooler<area, dt_float32, float, float> {
    using ctype = float;
    static constexpr int MIDOUT_CASE_NUM = 11;
    static constexpr int SIMD_WIDTH = 4;

    GI_FLOAT32_t res;
    GiMaxPooler(DType) : res(GiBroadcastFloat32(DTypeTrait<dt_float32>::min())) {}
    void feed(const float* val) { res = GiMaximumFloat32(res, GiLoadFloat32(val)); }
    void post(float* dst) { GiStoreFloat32(dst, res); }
};

//! Scalar pooling of a single output pixel (oh, ow): walks the window x
//! window region, feeds every in-bounds input element to \p Pooler and
//! writes the pooled value to dst.
template <typename Pooler, int window>
void do_pxl_naive(
        int oh, int ow, const typename Pooler::ctype* src, typename Pooler::ctype* dst,
        DType src_dtype, const int IH, const int IW, const int OH, const int OW,
        const int PH, const int PW, const int SH, const int SW) {
    MEGDNN_MARK_USED_VAR(OH);
    Pooler pooler(src_dtype);
    const int ih_base = oh * SH - PH;
    const int iw_base = ow * SW - PW;
    for (int wh = 0; wh < window; ++wh) {
        const int ih = ih_base + wh;
        if (ih < 0 || ih >= IH)
            continue;
        for (int ww = 0; ww < window; ++ww) {
            const int iw = iw_base + ww;
            if (iw >= 0 && iw < IW) {
                pooler.feed(src + ih * IW + iw);
            }
        }
    }
    pooler.post(dst + oh * OW + ow);
}

namespace detail {

//! Pools a SIMD-width batch of output pixels for the 2x2/stride-2 case;
//! declaration-only primary template, specialized per (Pooler, mode) below.
template <typename Pooler, Pooling::Mode mode>
struct do_pxl_2x2_pack_proxy {
    static void gao(
            int oh, int ow, const typename Pooler::ctype* src,
            typename Pooler::ctype* dst, DType, const int IH, const int IW,
            const int OH, const int OW, const int PH, const int PW);
};

//! AVERAGE 2x2/s2 fp32: computes 4 output pixels at once.  Two loads per
//! input row cover 8 columns; a vertical add followed by a pairwise
//! horizontal add (GiPadd) collapses each 2x2 window, then scale by 1/4.
template <>
struct do_pxl_2x2_pack_proxy<
        MeanInPooler<4, dt_float32, float, float>, Pooling::Mode::AVERAGE> {
    static void gao(
            int oh, int ow, const dt_float32* src, dt_float32* dst, DType, const int IH,
            const int IW, const int OH, const int OW, const int PH, const int PW) {
        MEGDNN_MARK_USED_VAR(IH);
        MEGDNN_MARK_USED_VAR(OH);
        static const auto avg_coef = GiBroadcastFloat32(0.25f);
        int ih = -PH + 2 * oh;
        int iw = -PW + 2 * ow;
        auto i00 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 0)),
             i01 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 4)),
             i10 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 0)),
             i11 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 4));
        auto sum0 = GiAddFloat32(i00, i10), sum1 = GiAddFloat32(i01, i11);
        auto vlow = GiPaddFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0));
        auto vhigh = GiPaddFloat32(GiGetLowFloat32(sum1), GiGetHighFloat32(sum1));
        auto comb = GiCombineFloat32(vlow, vhigh);
        auto result = GiMultiplyFloat32(comb, avg_coef);
        GiStoreFloat32(dst + oh * OW + ow, result);
    }
};

//! MAX 2x2/s2 fp32: same layout as the AVERAGE proxy, but with vertical max
//! and a pairwise horizontal max (GiPmax) instead of add, and no scaling.
template <>
struct do_pxl_2x2_pack_proxy<
        MaxPooler<4, dt_float32, float, float>, Pooling::Mode::MAX> {
    static void gao(
            int oh, int ow, const dt_float32* src, dt_float32* dst, DType, const int IH,
            const int IW, const int OH, const int OW, const int PH, const int PW) {
        MEGDNN_MARK_USED_VAR(IH);
        MEGDNN_MARK_USED_VAR(OH);
        int ih = -PH + 2 * oh;
        int iw = -PW + 2 * ow;
        auto i00 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 0)),
             i01 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 4)),
             i10 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 0)),
             i11 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 4));
        auto sum0 = GiMaximumFloat32(i00, i10), sum1 = GiMaximumFloat32(i01, i11);
        auto vlow = GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0));
        auto vhigh = GiPmaxFloat32(GiGetLowFloat32(sum1), GiGetHighFloat32(sum1));
        auto comb = GiCombineFloat32(vlow, vhigh);
        GiStoreFloat32(dst + oh * OW + ow, comb);
    }
};

} // namespace detail

//! Dispatch to the (Pooler, mode) specialization of the packed 2x2/s2 kernel.
template <typename Pooler, Pooling::Mode mode>
void do_pxl_2x2_pack(
        int oh, int ow, const typename Pooler::ctype* src, typename Pooler::ctype* dst,
        DType src_dtype, const int IH, const int IW, const int OH, const int OW,
        const int PH, const int PW) {
    detail::do_pxl_2x2_pack_proxy<Pooler, mode>::gao(
            oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW);
}

//! SIMD pooling of GiPooler::SIMD_WIDTH adjacent output pixels starting at
//! (oh, ow) for stride-1 windows.  The caller guarantees every touched input
//! element is in bounds, so no border checks are performed here.
template <typename GiPooler, int window>
void do_pxl_compact_packed(
        int oh, int ow, const typename GiPooler::ctype* src,
        typename GiPooler::ctype* dst, DType src_dtype, const int IH, const int IW,
        const int OH, const int OW, const int PH, const int PW) {
    MEGDNN_MARK_USED_VAR(IH);
    MEGDNN_MARK_USED_VAR(OH);
    GiPooler pooler(src_dtype);
    for (int wh = 0; wh < window; ++wh) {
        const int ih = oh + wh - PH;
        for (int ww = 0; ww < window; ++ww) {
            const int iw = ow + ww - PW;
            pooler.feed(src + ih * IW + iw);
        }
    }
    pooler.post(dst + oh * OW + ow);
}

//! Stride-1 pooling driver: scalar Pooler for border rows/columns, SIMD
//! GiPooler for the interior where a full SIMD_WIDTH of windows fits.
template <typename Pooler, typename GiPooler, int window>
void do_pooling_compact(
        const typename Pooler::ctype* src, typename Pooler::ctype* dst, DType src_dtype,
        const int IH, const int IW, const int OH, const int OW, const int PH,
        const int PW) {
    static_assert(
            std::is_same<typename Pooler::ctype, typename GiPooler::ctype>::value,
            "ctype of Pooler and GiPooler is not the same");
    const int stride = 1;
    int oh = 0;
    // rows clipped by the top padding
    for (; oh < OH && oh - PH < 0; ++oh) {
        int ow = 0;
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
    // rows whose window is fully inside the input vertically
    for (; oh < OH && oh - PH + window <= IH; ++oh) {
        int ow = 0;
        for (; ow < OW && ow - PW < 0; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
        // interior: the last lane's window must also fit horizontally
        for (; ow + GiPooler::SIMD_WIDTH <= OW &&
               ow + GiPooler::SIMD_WIDTH - 1 - PW + window <= IW;
             ow += GiPooler::SIMD_WIDTH) {
            do_pxl_compact_packed<GiPooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW);
        }
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
    // rows clipped by the bottom padding
    for (; oh < OH; ++oh) {
        int ow = 0;
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
}

//! 2x2/stride-2 pooling driver: scalar kernel on padded borders, the packed
//! SIMD kernel (do_pxl_2x2_pack) for the interior.
template <typename Pooler, Pooling::Mode mode>
void do_pooling_2x2(
        const typename Pooler::ctype* src, typename Pooler::ctype* dst, DType src_dtype,
        const int IH, const int IW, const int OH, const int OW, const int PH,
        const int PW) {
    const int window = 2;
    const int stride = 2;
    int oh = 0;
    // rows clipped by the top padding
    for (; oh < OH && -PH + stride * oh < 0; ++oh) {
        int ow = 0;
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
    // rows whose window is fully inside the input vertically
    for (; oh < OH && -PH + stride * oh + window <= IH; ++oh) {
        int ow = 0;
        for (; ow < OW && -PW + stride * ow < 0; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
        // interior: the last lane's window must also fit horizontally
        for (; ow + Pooler::SIMD_WIDTH <= OW &&
               -PW + stride * (ow + Pooler::SIMD_WIDTH - 1) + window <= IW;
             ow += Pooler::SIMD_WIDTH) {
            do_pxl_2x2_pack<Pooler, mode>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW);
        }
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
    // rows clipped by the bottom padding
    for (; oh < OH; ++oh) {
        int ow = 0;
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
}

//! Max pooling, 5x5 window, 2x2 stride, GI-vectorized.  Same scheme as the
//! 3x3/s2 kernel: each input row is max-reduced along W into one of 5
//! rotating row caches using the odd/even column split, then the cached rows
//! are reduced along H.  The body loads via GiLoadFloat32, so ctype is
//! effectively fp32 here.
//!
//! \param ws scratch: ws[0..4] five row caches, ws[5]/ws[6] the odd/even
//!           column split of one input row.
template <typename dtype, typename ctype>
void do_max_pooling_w5x5_s2x2_gi(
        const ctype* src, ctype* dst, const int IH, const int IW, const int OH,
        const int OW, const int PH, const int PW, const WorkspaceBundle& ws,
        const int MEGDNN_SIMD_WIDTH) {
    ctype* cache[5] = {
            static_cast<ctype*>(ws.get(0)), static_cast<ctype*>(ws.get(1)),
            static_cast<ctype*>(ws.get(2)), static_cast<ctype*>(ws.get(3)),
            static_cast<ctype*>(ws.get(4))};
    ctype* odd = static_cast<ctype*>(ws.get(5));
    ctype* even = static_cast<ctype*>(ws.get(6));
    int ih_next = 0;
    // SIMD-safe output column range (window fully inside the row)
    int OW_from = (PW + 1) / 2, OW_to = (IW + PW - 5) / 2 + 1;
    // W-reduce input row `ih` into cache[0]; caches are rotated first
    auto process_cache = [&](int ih) {
        const ctype* __restrict sptr = src + ih * IW;
        auto tmp = cache[4];
        for (auto i = 4; i >= 1; --i)
            cache[i] = cache[i - 1];
        cache[0] = tmp;
        // scalar fallback with border clipping
        auto run_single = [&](int ow) {
            int iw = ow * 2 - PW;
            ctype res = std::numeric_limits<dtype>::lowest();
            for (auto i = 0; i < 5; ++i)
                if (iw + i >= 0 && iw + i < IW)
                    res = std::max(res, sptr[iw + i]);
            cache[0][ow] = res;
        };
        // split the row into even/odd columns so the stride-2 window becomes
        // contiguous loads
        int iw = 0;
        int odd_offset = 0, even_offset = 0;
        for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) {
            auto s0 = GiLoadFloat32(sptr + iw + 0);
            auto s1 = GiLoadFloat32(sptr + iw + MEGDNN_SIMD_WIDTH);
            auto d = GiUzpqFloat32(s0, s1);
            GiStoreFloat32(even + even_offset, d.val[0]);
            GiStoreFloat32(odd + odd_offset, d.val[1]);
            even_offset += MEGDNN_SIMD_WIDTH;
            odd_offset += MEGDNN_SIMD_WIDTH;
        }
        for (; iw < IW; ++iw) {
            if (iw & 1)
                odd[odd_offset++] = sptr[iw];
            else
                even[even_offset++] = sptr[iw];
        }
        int ow = 0;
        for (; ow < OW_from; ++ow)
            run_single(ow);
        // the five window columns alternate between the odd and even planes;
        // the starting plane depends on the parity of PW
        if (PW & 1) {
            for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) {
                auto s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1);
                auto s1 = GiLoadFloat32(even + ow - (PW >> 1));
                auto s2 = GiLoadFloat32(odd + ow - (PW >> 1));
                auto s3 = GiLoadFloat32(even + ow - (PW >> 1) + 1);
                auto s4 = GiLoadFloat32(odd + ow - (PW >> 1) + 1);
                auto d = GiMaximumFloat32(
                        s0,
                        GiMaximumFloat32(
                                GiMaximumFloat32(s1, s2), GiMaximumFloat32(s3, s4)));
                GiStoreFloat32(cache[0] + ow, d);
            }
        } else {
            for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) {
                auto s0 = GiLoadFloat32(even + ow - (PW >> 1));
                auto s1 = GiLoadFloat32(odd + ow - (PW >> 1));
                auto s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1);
                auto s3 = GiLoadFloat32(odd + ow - (PW >> 1) + 1);
                auto s4 = GiLoadFloat32(even + ow - (PW >> 1) + 2);
                auto d = GiMaximumFloat32(
                        s0,
                        GiMaximumFloat32(
                                GiMaximumFloat32(s1, s2), GiMaximumFloat32(s3, s4)));
                GiStoreFloat32(cache[0] + ow, d);
            }
        }
        for (; ow < OW; ++ow)
            run_single(ow);
    };

    for (int oh = 0; oh < OH; ++oh) {
        ctype* __restrict dptr = dst + oh * OW;
        int ih_from = std::min(IH, std::max(0, oh * 2 - PH));
        int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 5));
        // rows visited monotonically: each input row is W-reduced exactly once
        while (ih_next < ih_to)
            process_cache(ih_next++);
        if (ih_to - ih_from == 5) {
            // full 5-row window: reduce all five cached lines
            int ow = 0;
            for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) {
                auto s0 = GiLoadFloat32(cache[0] + ow);
                auto s1 = GiLoadFloat32(cache[1] + ow);
                auto s2 = GiLoadFloat32(cache[2] + ow);
                auto s3 = GiLoadFloat32(cache[3] + ow);
                auto s4 = GiLoadFloat32(cache[4] + ow);
                auto d = GiMaximumFloat32(
                        s0,
                        GiMaximumFloat32(
                                GiMaximumFloat32(s1, s2), GiMaximumFloat32(s3, s4)));
                GiStoreFloat32(dptr + ow, d);
            }
            for (; ow < OW; ++ow)
                dptr[ow] = std::max(
                        {cache[0][ow], cache[1][ow], cache[2][ow], cache[3][ow],
                         cache[4][ow]});
        } else {
            // window clipped by padding: reduce only the existing rows
            std::memcpy(dptr, cache[0], sizeof(ctype) * OW);
            for (int i = 1; i < ih_to - ih_from; ++i) {
                int ow = 0;
                for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) {
                    auto s = GiLoadFloat32(cache[i] + ow);
                    auto d = GiLoadFloat32(dptr + ow);
                    d = GiMaximumFloat32(d, s);
                    GiStoreFloat32(dptr + ow, d);
                }
                for (; ow < OW; ++ow)
                    dptr[ow] = std::max(dptr[ow], cache[i][ow]);
            }
        }
    }
}

/*!
 * \brief 3x3 average pooling with 2x2 stride, vectorized with GI (general
 * intrinsic) SIMD along the width dimension.
 *
 * Workspace layout (from \p ws): slots 0-2 hold per-row caches of the
 * W-direction partial sums, slots 3/4 hold the current input row split into
 * odd- and even-indexed columns.  Every output is multiplied by 1/9
 * unconditionally, so window taps that fall outside the image contribute
 * zero (i.e. the padding area is counted in the average).
 *
 * \param MEGDNN_SIMD_WIDTH number of \p ctype lanes in one GI vector.
 */
template <typename ctype>
void do_average_pooling_3x3_s2x2_gi(
        const ctype* src, ctype* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_,
        size_t PH_, size_t PW_, const WorkspaceBundle& ws,
        const int MEGDNN_SIMD_WIDTH) {
    int IH = IH_, IW = IW_, OH = OH_, OW = OW_, PH = PH_, PW = PW_;
    // cache[i] stores the answer of the i-th line after
    // pooling along the W dimension.
    ctype* cache[3] = {
            static_cast<ctype*>(ws.get(0)), static_cast<ctype*>(ws.get(1)),
            static_cast<ctype*>(ws.get(2))};
    // de-interleaved copies of the current input row: odd/even columns
    ctype* odd = static_cast<ctype*>(ws.get(3));
    ctype* even = static_cast<ctype*>(ws.get(4));
    // next input row that still has to be pooled into the row cache
    int ih_next = 0;
    // "good" area means we can use SIMD to accelerate.
    auto get_good_area = [](int I, int /* O */, int P, int& O_from, int& O_to) {
        // x*2 - P >= 0; 2x >= P; x >= P/2
        O_from = (P + 1) / 2;
        // x*2 - P + 3 <= I; x*2 <= I+P-3; x <= (I+P-3)/2
        O_to = (I + P - 3) / 2 + 1;
        // we must have I >= 2 to ensure O_from <= O_to
    };
    int OW_from, OW_to;
    get_good_area(IW, OW, PW, OW_from, OW_to);
    // pool input row `ih` along W into cache[0]; older rows rotate to
    // cache[1] / cache[2]
    auto process_cache = [&](int ih) {
        const ctype* __restrict sptr = src + ih * IW;
        auto tmp = cache[2];
        cache[2] = cache[1];
        cache[1] = cache[0];
        cache[0] = tmp;
        // cache 0 is used to store the current answer.
        // scalar path: sums only the in-bounds taps, used near the borders
        auto run_single = [&](int ow) {
            int iw = ow * 2 - PW;
            ctype res = 0;
            if (iw + 0 >= 0 && iw + 0 < IW) {
                res += sptr[iw + 0];
            }
            if (iw + 1 >= 0 && iw + 1 < IW) {
                res += sptr[iw + 1];
            }
            if (iw + 2 >= 0 && iw + 2 < IW) {
                res += sptr[iw + 2];
            }
            cache[0][ow] = res;
        };
        // build odd/even
        int iw = 0;
        int odd_offset = 0, even_offset = 0;

        // de-interleave the row so the three stride-2 taps of each window
        // become contiguous loads from `even`/`odd`
        for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) {
            auto s0 = GiLd2qFloat32(sptr + iw);
            GiStoreFloat32(even + even_offset, s0.val[0]);
            GiStoreFloat32(odd + odd_offset, s0.val[1]);
            even_offset += MEGDNN_SIMD_WIDTH;
            odd_offset += MEGDNN_SIMD_WIDTH;
        }
        for (; iw < IW; ++iw) {
            if (iw & 1)
                odd[odd_offset++] = sptr[iw];
            else
                even[even_offset++] = sptr[iw];
        }
        int ow = 0;
        for (; ow < OW_from; ++ow)
            run_single(ow);
        // window for output ow covers columns 2*ow-PW .. 2*ow-PW+2; the
        // even/odd split of those taps depends on the parity of PW
        if (PW & 1) {
            for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) {
                auto s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1);
                auto s1 = GiLoadFloat32(even + ow - (PW >> 1));
                auto s2 = GiLoadFloat32(odd + ow - (PW >> 1));
                auto d = GiAddFloat32(GiAddFloat32(s0, s1), s2);
                GiStoreFloat32(cache[0] + ow, d);
            }
        } else {
            for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) {
                auto s0 = GiLoadFloat32(even + ow - (PW >> 1));
                auto s1 = GiLoadFloat32(odd + ow - (PW >> 1));
                auto s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1);
                auto d = GiAddFloat32(GiAddFloat32(s0, s1), s2);
                GiStoreFloat32(cache[0] + ow, d);
            }
        }
        for (; ow < OW; ++ow)
            run_single(ow);
    };
    for (int oh = 0; oh < OH; ++oh) {
        ctype* __restrict dptr = dst + oh * OW;
        // input rows touched by this output row, clipped to the image
        int ih_from = std::min(IH, std::max(0, oh * 2 - PH));
        int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 3));
        while (ih_next < ih_to) {
            process_cache(ih_next++);
        }
        // constant 1/9 scale: the divisor is always the full window area
        ctype factor = (1.0f / 9);
        auto coef = GiBroadcastFloat32(factor);
        if (ih_to - ih_from == 3) {
            // full 3-row window: sum the three cached rows and scale
            int ow = 0;
            for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) {
                auto s0 = GiLoadFloat32(cache[0] + ow);
                auto s1 = GiLoadFloat32(cache[1] + ow);
                auto s2 = GiLoadFloat32(cache[2] + ow);
                auto d = GiAddFloat32(GiAddFloat32(s0, s1), s2);
                d = GiMultiplyFloat32(d, coef);
                GiStoreFloat32(dptr + ow, d);
            }
#if MEGDNN_FIX_AARCH32_BUG
// FIXME: as llvm may cause cannot select error if enable vectorize
#pragma clang loop vectorize(disable)
#endif
            for (; ow < OW; ++ow) {
                dptr[ow] = (cache[0][ow] + cache[1][ow] + cache[2][ow]) * factor;
            }
        } else {
            // clipped window at the top/bottom border: accumulate the rows
            // that exist, then scale once at the end
            std::memcpy(dptr, cache[0], sizeof(ctype) * OW);
            int i = 1;
            for (; i < ih_to - ih_from; ++i) {
                int ow = 0;
                for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) {
                    auto s = GiLoadFloat32(cache[i] + ow);
                    auto d = GiLoadFloat32(dptr + ow);
                    d = GiAddFloat32(d, s);
                    GiStoreFloat32(dptr + ow, d);
                }
                for (; ow < OW; ++ow) {
                    dptr[ow] = (dptr[ow] + cache[i][ow]);
                }
            }
            int ow = 0;
            for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) {
                auto d = GiLoadFloat32(dptr + ow);
                d = GiMultiplyFloat32(d, coef);
                GiStoreFloat32(dptr + ow, d);
            }
#if MEGDNN_FIX_AARCH32_BUG
// FIXME: as llvm may cause cannot select error if enable vectorize
#pragma clang loop vectorize(disable)
#endif
            for (; ow < OW; ++ow) {
                dptr[ow] *= factor;
            }
        }
    }
}
} // anonymous namespace

// vim: syntax=cpp.doxygen

+ 174
- 12
dnn/src/fallback/pooling/opr_impl.cpp View File

@@ -6,18 +6,186 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
*/
#include "src/fallback/pooling/opr_impl.h"

#include <cstring>
#include "src/common/utils.h"
#include "src/naive/handle.h"
#include "src/common/algo_chooser.h"
#include "src/common/metahelper.h"
#include "src/fallback/pooling/gi/algo.h"

#include "midout.h"

MIDOUT_DECL(megdnn_fallback_pooling)

using namespace megdnn;
using namespace fallback;

/*!
 * \brief Owns one instance of every fallback pooling algorithm.
 *
 * Registration order defines the heuristic priority: the generic
 * \c AlgoFallback is appended last so specialized GI kernels win.
 */
class PoolingImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_desc_to_algo;
    AlgoGiFilterxModexStride1 m_gi_filterx_modex_stride1;
    AlgoGiFilter2ModexStride2 m_gi_filter2_modex_stride2;
    AlgoGiFilter3MaxStride2 m_gi_filter3_max_stride2;
    AlgoGiFilter3AverageStride2 m_gi_filter3_average_stride2;
    AlgoGiFilter4MaxStride2 m_gi_filter4_max_stride2;
    AlgoGiFilter5MaxStride2 m_gi_filter5_max_stride2;
    AlgoGiFp32ModexStridexNCHW44 m_gi_fp32_modex_stridex_nchw44;
    AlgoFallback m_fallback;

public:
    AlgoPack() {
        all_algos.emplace_back(&m_gi_filterx_modex_stride1);
        all_algos.emplace_back(&m_gi_filter2_modex_stride2);
        all_algos.emplace_back(&m_gi_filter3_max_stride2);
        all_algos.emplace_back(&m_gi_filter3_average_stride2);
        all_algos.emplace_back(&m_gi_filter4_max_stride2);
        all_algos.emplace_back(&m_gi_filter5_max_stride2);
        all_algos.emplace_back(&m_gi_fp32_modex_stridex_nchw44);
        all_algos.emplace_back(&m_fallback);

        //! index every algorithm by its descriptor for reverse lookup
        for (AlgoBase* algo : all_algos) {
            m_desc_to_algo.emplace(algo->info().desc, algo);
        }
    }
    SmallVector<AlgoBase*> all_algos;
    const AlgoBase::Mapper& all_algos_map() const { return m_desc_to_algo; }
};

PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack;

//! Collect the pooling shapes/hyper-parameters into a PoolingKernSizeParam,
//! narrowing every size_t to uint32_t with an overflow assertion.
PoolingImpl::PoolingKernSizeParam PoolingImpl::make_pooling_kern_szie_param(
        fallback::PoolingImpl* opr, const TensorLayout& src, const TensorLayout& dst) {
    auto to_u32 = [](size_t v) -> uint32_t {
        megdnn_assert(
                v <= std::numeric_limits<uint32_t>::max(), "value too large: %zu", v);
        return static_cast<uint32_t>(v);
    };
    const auto& p = opr->param();
    return {to_u32(src.shape[0]),
            to_u32(src.shape[1]),
            {{to_u32(src.shape[2]), to_u32(src.shape[3])}},
            {{to_u32(dst.shape[2]), to_u32(dst.shape[3])}},
            {{to_u32(p.pad_h), to_u32(p.pad_w)}},
            {{to_u32(p.window_h), to_u32(p.window_w)}},
            {{to_u32(p.stride_h), to_u32(p.stride_w)}},
            src.dtype,
            dst.dtype,
            opr->handle(),
            p.format,
            p.mode};
}

//! Extend the size param with the runtime src/dst/workspace pointers.
PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param(
        fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst,
        _megdnn_workspace workspace) {
    PoolingKernParam ret;
    //! fill the shape/hyper-parameter slice first ...
    static_cast<PoolingKernSizeParam&>(ret) =
            make_pooling_kern_szie_param(opr, src.layout, dst.layout);
    //! ... then attach the buffers
    ret.workspace_ptr = workspace.raw_ptr;
    ret.workspace_size = workspace.size;
    ret.src_ptr = src.get_ref_ptr();
    ret.dst_ptr = dst.get_ref_ptr();
    return ret;
}

MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingImpl);

std::vector<Algorithm*> PoolingImpl::get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) {
auto param = make_pooling_kern_szie_param(this, src, dst);
std::vector<Algorithm*> ret;
ret.reserve(algo_pack().all_algos.size());
for (auto i : algo_pack().all_algos) {
if (i->usable(param)) {
ret.push_back(i);
}
}
return ret;
}

//! Workspace (bytes) needed by the algorithm the heuristic would pick.
//! Cached results are returned directly; the non-GI fallback delegates to the
//! naive implementation; GI kernels may need a per-thread WorkspaceBundle.
size_t PoolingImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& dst) {
    TensorLayoutArray layouts{src, dst};
    AlgorithmCache::Key key{this->handle(), this->get_opr_type(),
                            layouts.data(), layouts.size(),
                            &this->param(), sizeof(this->param())};
    auto rst = AlgorithmCache::instance().get(key);
    if (rst.policy.algo.valid()) {
        return rst.workspace;
    }

    auto algo = static_cast<AlgoBase*>(fallback::PoolingImpl::get_algorithm_heuristic(
            src, dst, std::numeric_limits<size_t>::max(), AlgoAttribute::DEFAULT,
            AlgoAttribute::DEFAULT));
    if (is_fallback_non_gi_algo(algo)) {
        //! legacy (non-GI) path: the naive implementation sizes its own workspace
        return naive::PoolingForwardImpl::get_workspace_in_bytes(src, dst);
    }

    auto param = make_pooling_kern_szie_param(this, src, dst);
    size_t fallback_gi_workspace = 0;
    //! only the f32 NCHW 3x3/5x5 stride-2 kernels (max, or 3x3 average) use a
    //! scratch bundle; every worker thread gets its own copy
    if (param.src_type.category() == DTypeCategory::FLOAT &&
        param.filter[0] == param.filter[1] &&
        (param.filter[0] == 3 || param.filter[0] == 5) &&
        param.format == Param::Format::NCHW &&
        (param.mode == Mode::MAX ||
         (param.mode == Mode::AVERAGE && param.filter[0] == 3)) &&
        param.stride[0] == 2 && param.stride[1] == 2 && param.isz[0] >= 2 &&
        param.isz[1] >= 2) {
        //! when multi-thread, every thread has its own workspace
        size_t nr_threads = static_cast<naive::HandleImpl*>(handle())
                                    ->megcore_dispatcher()
                                    ->nr_threads();
        WorkspaceBundle ws = get_bundle(param);
        fallback_gi_workspace = ws.total_size_in_bytes() * nr_threads;
    }
    return fallback_gi_workspace;
}

//! Dispatch to the heuristically chosen algorithm; the non-GI fallback
//! algorithm is executed through the legacy exec_fallback() path instead.
void PoolingImpl::exec(
        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
    check_exec(src.layout, dst.layout, workspace.size);
    auto algo = static_cast<AlgoBase*>(fallback::PoolingImpl::get_algorithm_heuristic(
            src.layout, dst.layout, std::numeric_limits<size_t>::max(),
            AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT));
    if (is_fallback_non_gi_algo(algo)) {
        exec_fallback(src, dst, workspace);
        return;
    }
    algo->exec(make_pooling_kern_param(this, src, dst, workspace));
}

//! Same as get_all_algorithms(), but asserts that at least one algorithm
//! (the always-usable fallback) applies.
std::vector<Algorithm*> PoolingImpl::get_all_algorithms_safe(
        const TensorLayout& src, const TensorLayout& dst) {
    auto algos = get_all_algorithms(src, dst);
    megdnn_assert(!algos.empty(), "no usable pooling fwd algorithm");
    return algos;
}

//! Pick the first registered algorithm that is usable and matches the
//! requested attributes; registration order encodes priority.
Algorithm* PoolingImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& dst,
        size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);

    const auto param = make_pooling_kern_szie_param(this, src, dst);
    for (auto* algo : sm_algo_pack.all_algos) {
        if (algo->is_available_attribute(param, positive_attr, negative_attr)) {
            return algo;
        }
    }
    megdnn_throw(ssprintf(
            "require algorithm with attribute(%s) and without "
            "attribute(%s), but can't get suitable algo.\n",
            Algorithm::attribute_str(positive_attr).c_str(),
            Algorithm::attribute_str(negative_attr).c_str()));
    return nullptr;  //! unreachable: megdnn_throw does not return
}
//! fallback not gi imp
namespace megdnn {
namespace fallback {
namespace pooling {
@@ -140,9 +308,6 @@ void w2x2_s2x2_avg_int8(
} // namespace fallback
} // namespace megdnn

namespace megdnn {
namespace fallback {

void PoolingImpl::exec_w3x3_s1x1(
_megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param) {
auto N = src.layout.shape[0], C = src.layout.shape[1];
@@ -179,7 +344,7 @@ void PoolingImpl::exec_w2x2_s2x2_avg_int8(
}
}

void PoolingImpl::exec(
void PoolingImpl::exec_fallback(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
Param param = this->param();
check_exec(src.layout, dst.layout, workspace.size);
@@ -219,7 +384,4 @@ void PoolingImpl::exec(
naive::PoolingForwardImpl::exec(src, dst, workspace);
}

} // namespace fallback
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 130
- 5
dnn/src/fallback/pooling/opr_impl.h View File

@@ -10,6 +10,7 @@
* implied.
*/
#pragma once
#include <unordered_map>
#include "megdnn/oprs/base.h"
#include "src/naive/pooling/opr_impl.h"

@@ -17,19 +18,143 @@ namespace megdnn {
namespace fallback {

class PoolingImpl : public naive::PoolingForwardImpl {
private:
class AlgoGiFilterxModexStride1;
class AlgoGiFilter2ModexStride2;
class AlgoGiFilter3MaxStride2;
class AlgoGiFilter3AverageStride2;
class AlgoGiFilter4MaxStride2;
class AlgoGiFilter5MaxStride2;
class AlgoGiFp32ModexStridexNCHW44;
class AlgoFallback;
class AlgoPack;
static AlgoPack sm_algo_pack;

void exec_w3x3_s1x1(
_megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param);
void exec_w2x2_s2x2_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst);
void exec_w2x2_s2x2_avg_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst);

public:
using naive::PoolingForwardImpl::PoolingForwardImpl;
using Param = param::Pooling;

void exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;

private:
void exec_w3x3_s1x1(
_megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param);
void exec_w2x2_s2x2_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst);
void exec_w2x2_s2x2_avg_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst);
void exec_fallback(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace);

size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override;

static size_t constexpr MAX_SPATIAL_DIM = 2;

struct PoolingKernSizeParam {
uint32_t n, ic;
std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
std::array<uint32_t, MAX_SPATIAL_DIM> padding, filter, stride;
DType src_type, dst_type;
Handle* handle;
Param::Format format;
Mode mode;
};

struct PoolingKernParam : public PoolingKernSizeParam {
RefPtr src_ptr;
RefPtr dst_ptr;
void* workspace_ptr;
size_t workspace_size;

template <typename T>
const T* src() const {
src_type.assert_is_compatible_ctype<T>();
return static_cast<const T*>(src_ptr.get_ptr());
}

template <typename T>
T* dst() const {
dst_type.assert_is_compatible_ctype<T>();
return static_cast<T*>(dst_ptr.get_ptr());
}

template <typename T>
T* workspace() const {
return static_cast<T*>(workspace_ptr);
}
};

PoolingKernSizeParam make_pooling_kern_szie_param(
fallback::PoolingImpl* opr, const TensorLayout& src,
const TensorLayout& dst);

PoolingKernParam make_pooling_kern_param(
fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace);
class AlgoBase : public detail::Algorithm {
public:
enum class AlgoType : uint32_t {
GI_FilterxModexStride1,
GI_Filter2ModexStride2,
GI_Filter3MaxStride2,
GI_Filter3AverageStride2,
GI_Filter4MaxStride2,
GI_Filter5MaxStride2,
GI_Filter2ModexStridexNCHW44,
GI_Filter3ModexStridexNCHW44,
GI_Filter4ModexStridexNCHW44,
GI_Filter5ModexStridexNCHW44,
GI_Fp32ModexStridexNCHW44,
FallbackNotGI
};

using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::FALLBACK; }
virtual ~AlgoBase() = default;
virtual bool usable(const PoolingKernSizeParam& param) const = 0;
virtual void exec(const PoolingKernParam& param) const = 0;

uint32_t type() const override { return INVALID_ALGO_TYPE; };
bool is_available_attribute(
const PoolingKernSizeParam& param,
const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
return contain_attribute_all(positive_attr) &&
!contain_attribute_any(negative_attr) && usable(param);
}
};

const char* get_algorithm_set_name() const override {
return "FALLBACK_POOLING_FORWARD";
}

Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override;

std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) override;
std::vector<Algorithm*> get_all_algorithms_safe(
const TensorLayout& src, const TensorLayout& dst) override;

Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;

AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
return fallback::PoolingImpl::get_algorithm_heuristic(
src, dst, workspace_limit_in_bytes, positive_attr, negative_attr)
->info();
}

static const AlgoPack& algo_pack() { return sm_algo_pack; }
bool is_fallback_non_gi_algo(Algorithm* algo) {
return strcmp(algo->name(), "FALLBACK_NOT_GI_POOLING") == 0;
}
};
} // namespace fallback
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 3
- 1
dnn/src/x86/pooling/algo.h View File

@@ -103,7 +103,9 @@ public:
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
const char* name() const override { return m_algo_name.c_str(); }
bool is_available(const SizeArgs&) const override { return true; }
void exec(const ExecArgs&) const override {}
void exec(const ExecArgs&) const override {
megdnn_assert(false, "code issue happened!!");
}
MEGDNN_DECL_ALGO_TYPE(X86_Fallback)
};



+ 38
- 0
dnn/test/fallback/gi.cpp View File

@@ -3161,6 +3161,44 @@ TEST_F(FALLBACK, GiGetHighFloat32) {
ASSERT_EQ(*(r + 1), s0[3]);
}

TEST_F(FALLBACK, GiPaddFloat32) {
    //! pairwise add on two-lane vectors: ret = {s0[0]+s0[1], s1[0]+s1[1]}
    float32x2_t src0, src1, ret;
    std::vector<float> s0{1.1f, -3.1415f};
    std::vector<float> s1{2.3f, 3.14777f};
    memcpy(&src0, s0.data(), sizeof(float32x2_t));
    memcpy(&src1, s1.data(), sizeof(float32x2_t));

    ret = GiPaddFloat32(src0, src1);

    std::vector<float> naive;
    naive.push_back(s0[0] + s0[1]);
    naive.push_back(s1[0] + s1[1]);

    auto r = (float*)&ret;
    ASSERT_LT(std::abs(naive[0] - r[0]), 1e-3);
    ASSERT_LT(std::abs(naive[1] - r[1]), 1e-3);
}

TEST_F(FALLBACK, GiPmaxFloat32) {
    //! pairwise max on two-lane vectors: ret = {max(s0[0],s0[1]),
    //! max(s1[0],s1[1])}; the reference uses the MAX_NAN helper so NaN
    //! handling matches the intrinsic's semantics
    float32x2_t src0, src1, ret;
    std::vector<float> s0{1.1f, -3.1415f};
    std::vector<float> s1{2.3f, 3.14777f};
    memcpy(&src0, s0.data(), sizeof(float32x2_t));
    memcpy(&src1, s1.data(), sizeof(float32x2_t));

    ret = GiPmaxFloat32(src0, src1);

    std::vector<float> naive;
    auto t0 = MAX_NAN(s0[0], s0[1]);
    auto t1 = MAX_NAN(s1[0], s1[1]);
    naive.push_back(t0);
    naive.push_back(t1);

    auto r = (float*)&ret;
    ASSERT_LT(std::abs(naive[0] - r[0]), 1e-3);
    ASSERT_LT(std::abs(naive[1] - r[1]), 1e-3);
}

} // namespace test
} // namespace megdnn



+ 560
- 0
dnn/test/fallback/pooling.cpp View File

@@ -0,0 +1,560 @@
/**
* \file dnn/test/fallback/pooling.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "test/fallback/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/pooling.h"
#include "test/common/rng.h"
#include "test/common/task_record_check.h"

namespace megdnn {
namespace test {

namespace {
//! Enumerate NCHW44 pooling cases (max/average) for the given window size and
//! stride over a grid of batch/channel/spatial/padding values, keeping only
//! configurations where the window fits.
std::vector<std::pair<param::Pooling, TensorShapeArray>> get_nchw44_pool_args(
        size_t filter, size_t stride) {
    constexpr size_t ic_step = 4;
    std::vector<std::pair<param::Pooling, TensorShapeArray>> args;
    constexpr size_t spatial[] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13};

    auto add_case = [&](size_t n, size_t c, size_t ih, size_t iw, size_t ph,
                        size_t pw, param::Pooling::Mode mode) {
        //! reject windows larger than the padded input or fully inside padding
        if (ih + 2 * ph < filter || iw + 2 * pw < filter || filter <= ph ||
            filter <= pw)
            return;
        param::Pooling param;
        param.mode = mode;
        param.format = param::Pooling::Format::NCHW44;
        param.pad_h = ph;
        param.pad_w = pw;
        param.stride_h = param.stride_w = stride;
        param.window_h = param.window_w = filter;
        args.emplace_back(
                param, TensorShapeArray{{n, c / ic_step, ih, iw, ic_step}, {}});
    };

    for (size_t n : {1, 2})
        for (size_t c : {4, 8})
            for (size_t ih : spatial)
                for (size_t iw : spatial)
                    for (size_t ph : {0, 1, 2})
                        for (size_t pw : {0, 1, 2})
                            for (auto mode :
                                 {param::Pooling::Mode::MAX,
                                  param::Pooling::Mode::AVERAGE})
                                add_case(n, c, ih, iw, ph, pw, mode);
    return args;
}

//! Run the checker over every (param, shape) case, using a quantized-int8
//! dtype/RNG or a small-range float RNG depending on \p is_int8.
void run_pooling_check(
        Handle* handle, std::vector<std::pair<param::Pooling, TensorShapeArray>> args,
        bool is_int8) {
    Checker<Pooling> checker(handle);
    UniformIntRNG int8_rng{INT8_MIN >> 1, INT8_MAX >> 1};
    UniformIntRNG fp32_rng{-10, 10};
    if (is_int8) {
        checker.set_dtype(0, dtype::QuantizedS8(1.1f));
        checker.set_rng(0, &int8_rng);
    } else {
        checker.set_rng(0, &fp32_rng);
    }
    for (const auto& arg : args) {
        checker.set_param(arg.first).exec(arg.second);
    }
}
} // namespace

TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_NCHW44_FP32) {
    //! fp32 NCHW44 pooling: every supported window/stride combination
    for (auto window : {2, 3, 4, 5}) {
        for (auto stride : {1, 2}) {
            run_pooling_check(handle(), get_nchw44_pool_args(window, stride), false);
        }
    }
}

TEST_F(FALLBACK, POOLING_GI) {
    //! Smoke-test the GI pooling kernels: max/average window 3, max window
    //! 4/5 (all stride 2), and average-count-exclude-padding window 3 stride
    //! 1, over a grid of spatial sizes and paddings.
    using Param = param::Pooling;
    //! the checker is loop-invariant; construct it once instead of per
    //! iteration (it builds operator handles internally)
    Checker<Pooling> checker(handle());
    // clang-format off
    for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t p: {1, 2})
    {
        Param param;
        param.mode = Param::Mode::MAX;
        param.window_h = param.window_w = 3;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});

        param.mode = Param::Mode::AVERAGE;
        param.window_h = param.window_w = 3;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});

        param.mode = Param::Mode::MAX;
        param.window_h = param.window_w = 4;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});

        param.mode = Param::Mode::MAX;
        param.window_h = param.window_w = 5;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        //! a 5x5 window needs a padded input of at least 5x5
        if (ih + p * 2 >= 5 && iw + p * 2 >= 5)
            checker.set_param(param).exec({{2, 3, ih, iw}, {}});
    }
    for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t p: {1, 2})
    {
        Param param;
        param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
        param.window_h = param.window_w = 3;
        param.stride_h = param.stride_w = 1;
        param.pad_h = param.pad_w = p;
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});
    }
    // clang-format on
}

TEST_F(FALLBACK, POOLING_GI_RECORD) {
    //! Exercise the GI pooling kernels through the task-record
    //! (capture/replay) dispatcher.
    using Param = param::Pooling;
    TaskRecordChecker<Pooling> checker(0);
    // clang-format off
    for (size_t ih: {2, 3, 5, 7, 11, 13, 17})
    for (size_t iw: {2, 3, 5, 7, 11, 13, 17})
    for (size_t p: {1, 2})
    {
        Param param;
        param.mode = Param::Mode::MAX;
        param.window_h = param.window_w = 3;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});

        param.mode = Param::Mode::AVERAGE;
        param.window_h = param.window_w = 3;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});

        param.mode = Param::Mode::MAX;
        param.window_h = param.window_w = 4;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});

        param.mode = Param::Mode::MAX;
        param.window_h = param.window_w = 5;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        if (ih + p * 2 >= 5 && iw + p * 2 >= 5)
            checker.set_param(param).exec({{2, 3, ih, iw}, {}});
    }
    for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t p: {1, 2})
    {
        Param param;
        param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
        param.window_h = param.window_w = 3;
        param.stride_h = param.stride_w = 1;
        param.pad_h = param.pad_w = p;
        //! use the outer TaskRecordChecker here: previously a fresh plain
        //! Checker(handle()) was constructed per iteration, which shadowed
        //! the recorder and bypassed the record/replay path this test exists
        //! to verify
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});
    }
    // clang-format on
}

TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_RECORD) {
    //! Same record/replay coverage as the single-thread POOLING_GI_RECORD
    //! test, run on the multi-thread fixture.
    using Param = param::Pooling;
    TaskRecordChecker<Pooling> checker(0);
    for (size_t ih : {2, 3, 5, 7, 11, 13, 17})
        for (size_t iw : {2, 3, 5, 7, 11, 13, 17})
            for (size_t p : {1, 2}) {
                Param param;
                param.mode = Param::Mode::MAX;
                param.window_h = param.window_w = 3;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = p;
                checker.set_param(param).exec({{2, 3, ih, iw}, {}});

                param.mode = Param::Mode::AVERAGE;
                param.window_h = param.window_w = 3;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = p;
                checker.set_param(param).exec({{2, 3, ih, iw}, {}});

                param.mode = Param::Mode::MAX;
                param.window_h = param.window_w = 4;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = p;
                checker.set_param(param).exec({{2, 3, ih, iw}, {}});

                param.mode = Param::Mode::MAX;
                param.window_h = param.window_w = 5;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = p;
                //! a 5x5 window needs a padded input of at least 5x5
                if (ih + p * 2 >= 5 && iw + p * 2 >= 5)
                    checker.set_param(param).exec({{2, 3, ih, iw}, {}});
            }
}

TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_W9_w13_NCHW44) {
    //! large-window (9x9 / 13x13) stride-1 NCHW44 pooling, max and average
    UniformIntRNG rng{-10, 10};
    Checker<Pooling> checker(handle());
    checker.set_rng(0, &rng);
    // clang-format off
    for (size_t ih: {20, 15})
    for (size_t iw: {15, 20})
    for (size_t kernel: {9, 13})
    for (size_t pad: {4, 6})
    for(auto mode: {param::Pooling::Mode::MAX, param::Pooling::Mode::AVERAGE})
    if (kernel > pad)
    {
        param::Pooling param;
        param.mode = mode;
        param.format = param::Pooling::Format::NCHW44;
        param.pad_h = pad;
        param.pad_w = pad;
        param.stride_h = param.stride_w = 1;
        param.window_h = param.window_w = kernel ;
        checker.set_param(param).exec(TensorShapeArray{{2, 8, ih, iw, 4}, {}});
    }
    // clang-format on
}

TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_FALLBACK) {
    //! 3x3 stride-2 max pooling over a grid of spatial sizes and paddings.
    using Param = param::Pooling;
    //! hoisted: the checker was previously constructed on every iteration
    Checker<Pooling> checker(handle());
    for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
        for (size_t iw : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
            for (size_t p : {1, 2}) {
                Param param;
                param.mode = Param::Mode::MAX;
                param.window_h = param.window_w = 3;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = p;
                checker.set_param(param).exec({{2, 3, ih, iw}, {}});
            }
}

TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI) {
    //! Multi-thread coverage of the GI pooling kernels: max/average window 3,
    //! max window 4/5, all with stride 2.
    using Param = param::Pooling;
    //! hoisted: the checker was previously constructed on every iteration
    Checker<Pooling> checker(handle());
    for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
        for (size_t iw : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
            for (size_t p : {1, 2}) {
                Param param;
                param.mode = Param::Mode::MAX;
                param.window_h = param.window_w = 3;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = p;
                checker.set_param(param).exec({{2, 3, ih, iw}, {}});

                param.mode = Param::Mode::AVERAGE;
                param.window_h = param.window_w = 3;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = p;
                checker.set_param(param).exec({{2, 3, ih, iw}, {}});

                param.mode = Param::Mode::MAX;
                param.window_h = param.window_w = 4;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = p;
                checker.set_param(param).exec({{2, 3, ih, iw}, {}});

                param.mode = Param::Mode::MAX;
                param.window_h = param.window_w = 5;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = p;
                //! a 5x5 window needs a padded input of at least 5x5
                if (ih + p * 2 >= 5 && iw + p * 2 >= 5)
                    checker.set_param(param).exec({{2, 3, ih, iw}, {}});
            }
}

#if MEGDNN_WITH_BENCHMARK
namespace {
//! Benchmark fp32 pooling: NCHW fp32 vs NCHW44 qint8 vs NCHW44 fp32 on the
//! same logical shape, reporting time, Mflops and the NCHW->NCHW44 speedup.
void benchmark_nchw44_fp32(Handle* handle) {
    using Param = param::Pooling;
    auto run = [&](size_t n, size_t c, size_t h, size_t w, size_t filter, size_t stride,
                   size_t pad, Param::Mode mode) {
        Param param;
        param.window_h = param.window_w = filter;
        param.stride_h = param.stride_w = stride;
        param.pad_h = param.pad_w = pad;
        param.format = Param::Format::NCHW;
        param.mode = mode;
        TensorShape nchw_shape = {n, c, h, w};
        TensorShape nchw44_shape = {n, c / 4, h, w, 4};
        TensorLayout dst_layout;
        auto opr = handle->create_operator<Pooling>();
        opr->param() = param;
        opr->deduce_layout({nchw_shape, dtype::Float32()}, dst_layout);
        //! one multiply-free "op" per window element per output element
        float calc_amount =
                dst_layout.total_nr_elems() * param.window_h * param.window_w;

        Benchmarker<Pooling> benchmarker_float_nchw(handle);
        Benchmarker<Pooling> benchmarker_float_nchw44(handle);
        Benchmarker<Pooling> benchmarker_int_nchw44(handle);
        size_t RUN = 500;
        //! t1: NCHW fp32 baseline
        auto t1 = benchmarker_float_nchw.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec({nchw_shape, {}});

        param.format = Param::Format::NCHW44;
        //! t2: NCHW44 quantized int8
        auto t2 = benchmarker_int_nchw44.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .execl({{nchw44_shape, dtype::QuantizedS8(1.0)},
                                  {{}, dtype::QuantizedS8(1.0)}});
        //! t3: NCHW44 fp32 (the kernel moved to fallback GI)
        auto t3 = benchmarker_float_nchw44.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec({nchw44_shape, {}});

        printf("{%zu %zu %zu %zu} filter = %zu, stride = %zu pad = %zu\n"
               "nchw_fp32={%.3f ms, %.3f Mflops}, "
               "nchw44_int={%.3f ms, %.3f Mflops}, "
               "nchw44_fp32={%.3f ms, %.3f Mflops, speed_up %f}\n\n",
               n, c, h, w, filter, stride, pad, t1 / RUN,
               calc_amount / (t1 / RUN * 1000), t2 / RUN,
               calc_amount / (t2 / RUN * 1000), t3 / RUN,
               calc_amount / (t3 / RUN * 1000), t1 / t3);
    };
    // Resnet50
    run(1, 64, 112, 112, 3, 2, 1, param::Pooling::Mode::MAX);
    run(1, 2048, 7, 7, 7, 1, 0, param::Pooling::Mode::AVERAGE);

    // VGG16
    run(1, 64, 224, 224, 2, 2, 0, param::Pooling::Mode::MAX);
    run(1, 128, 112, 112, 2, 2, 0, param::Pooling::Mode::MAX);
    run(1, 256, 56, 56, 2, 2, 0, param::Pooling::Mode::MAX);
    run(1, 512, 28, 28, 2, 2, 0, param::Pooling::Mode::MAX);
    run(1, 512, 14, 14, 2, 2, 0, param::Pooling::Mode::MAX);
}
} // namespace

TEST_F(FALLBACK, BENCHMARK_POOLING_GI_NCHW44_FP32) {
    //! single-thread run of the NCHW44 fp32 pooling benchmark
    benchmark_nchw44_fp32(handle());
}

TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_POOLING_GI_NCHW44_FP32) {
    //! multi-thread run of the NCHW44 fp32 pooling benchmark
    benchmark_nchw44_fp32(handle());
}
TEST_F(FALLBACK, BENCHMARK_POOLING_GI_W4x4_S2x2) {
    //! Benchmark 4x4 stride-2 max pooling: naive (double-dispatched)
    //! implementation vs the fallback GI kernel.
    using Param = param::Pooling;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        std::cout << "N:" << shapes[0][0] << " "
                  << "IC:" << shapes[0][1] << " "
                  << "IH:" << shapes[0][2] << " "
                  << "IW:" << shapes[0][3] << std::endl;
        auto handle_naive = create_cpu_handle(2);
        Benchmarker<Pooling> benchmarker_naive(handle_naive.get());
        Benchmarker<Pooling> benchmarker_float(handle());
        size_t RUN = 10;
        auto t1 = benchmarker_naive.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        auto t2 = benchmarker_float.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Pooling>();
        opr->param() = param;
        opr->deduce_layout({shapes[0], dtype::Float32()}, dst_layout);
        float calc_amount =
                dst_layout.total_nr_elems() * param.window_h * param.window_w;
        //! label fixed: this file benchmarks the fallback GI kernel, the old
        //! "neon" label was copied from the arm_common benchmark
        printf("naive={%.3fms, %.3fMflops}, fallback_gi={%.3fms, %.3fMflops}\n",
               t1 / RUN, calc_amount / (t1 / RUN * 1000), t2 / RUN,
               calc_amount / (t2 / RUN * 1000));
    };
    Param param;
    param.window_h = param.window_w = 4;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    std::cout << "4x4 with 2x2 stride max pooling:" << std::endl;
    run({{1, 24, 160, 128}, {}}, param);
    run({{1, 4, 240, 135}, {}}, param);
    run({{1, 32, 120, 67}, {}}, param);
    run({{1, 64, 60, 33}, {}}, param);
}

TEST_F(FALLBACK, BENCHMARK_POOLING_GI_W5x5_S2x2) {
    //! Benchmark 5x5 stride-2 max pooling: naive (double-dispatched)
    //! implementation vs the fallback GI kernel.
    using Param = param::Pooling;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        std::cout << "N:" << shapes[0][0] << " "
                  << "IC:" << shapes[0][1] << " "
                  << "IH:" << shapes[0][2] << " "
                  << "IW:" << shapes[0][3] << std::endl;
        auto handle_naive = create_cpu_handle(2);
        Benchmarker<Pooling> benchmarker_naive(handle_naive.get());
        Benchmarker<Pooling> benchmarker_float(handle());
        size_t RUN = 10;
        auto t1 = benchmarker_naive.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        auto t2 = benchmarker_float.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Pooling>();
        opr->param() = param;
        opr->deduce_layout({shapes[0], dtype::Float32()}, dst_layout);
        float calc_amount =
                dst_layout.total_nr_elems() * param.window_h * param.window_w;
        //! label fixed: this file benchmarks the fallback GI kernel, the old
        //! "neon" label was copied from the arm_common benchmark
        printf("naive={%.3fms, %.3fMflops}, fallback_gi={%.3fms, %.3fMflops}\n",
               t1 / RUN, calc_amount / (t1 / RUN * 1000), t2 / RUN,
               calc_amount / (t2 / RUN * 1000));
    };
    Param param;
    param.window_h = param.window_w = 5;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    std::cout << "5x5 with 2x2 stride max pooling:" << std::endl;
    run({{1, 24, 160, 128}, {}}, param);
    run({{1, 4, 240, 135}, {}}, param);
    run({{1, 32, 120, 67}, {}}, param);
    run({{1, 64, 60, 33}, {}}, param);
}
namespace {
template <typename Opr>
void benchmark_impl(
const typename Opr::Param& param, std::vector<SmallVector<TensorShape>> shapes,
size_t RUNS, TaskExecutorConfig&& multi_thread_config,
TaskExecutorConfig&& single_thread_config, DType data_type) {
std::vector<float> multi_thread_times, single_thread_times;
{
auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
auto benchmarker = Benchmarker<Opr>(multi_thread_hanle.get());
benchmarker.set_times(RUNS).set_display(false).set_param(param);
benchmarker.set_dtype(0, data_type);
for (auto shape : shapes) {
multi_thread_times.push_back(benchmarker.exec(shape) / RUNS);
}
}
{
auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
auto benchmarker = Benchmarker<Opr>(single_thread_handle.get());
benchmarker.set_times(RUNS).set_display(false).set_param(param);
benchmarker.set_dtype(0, data_type);
for (auto shape : shapes) {
single_thread_times.push_back(benchmarker.exec(shape) / RUNS);
}
}
printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
printf("core_ids:");
for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
printf("%zu ", multi_thread_config.affinity_core_set[i]);
}
printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
for (size_t i = 0; i < shapes.size(); i++) {
auto shape = shapes[i];
printf("Case: ");
for (auto sh : shape)
printf("%s ", sh.to_string().c_str());
printf("%zu threads time: %f,\n single thread time: "
"%f. spead up = %f, speedup/cores=%f\n",
multi_thread_config.nr_thread, multi_thread_times[i],
single_thread_times[i], single_thread_times[i] / multi_thread_times[i],
single_thread_times[i] / multi_thread_times[i] /
multi_thread_config.nr_thread);
}
}
} // namespace

// Multi-thread scalability benchmark for 3x3/stride-2 fp32 pooling across
// several thread/affinity configurations.
TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_POOLING_GI) {
    constexpr size_t RUNS = 50;

    using Param = param::Pooling;
    Param param;
    // NOTE: the original set window/stride/pad here and then assigned the
    // exact same values again after building `shapes`; the dead duplicate
    // initialization has been removed — the parameters are set once below.
    std::vector<SmallVector<TensorShape>> shapes;

    shapes.push_back({{32, 32, 215, 215}, {}});
    shapes.push_back({{32, 32, 128, 128}, {}});
    shapes.push_back({{8, 256, 100, 100}, {}});
    shapes.push_back({{1, 256, 100, 100}, {}});
    shapes.push_back({{1, 32, 100, 100}, {}});
    shapes.push_back({{1, 256, 80, 80}, {}});
    shapes.push_back({{1, 256, 60, 60}, {}});
    shapes.push_back({{1, 256, 30, 30}, {}});

    param.window_h = param.window_w = 3;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    printf("Benchmark POOLING kernel:%d*%d stride:%d,mode %d\n", param.window_h,
           param.window_w, param.stride_h, static_cast<int>(param.mode));
    // 4 threads vs 1 thread on the first core cluster, then on the second
    // cluster, then 2 threads vs 1.
    benchmark_impl<Pooling>(
            param, shapes, RUNS, {4, {0, 1, 2, 3}}, {1, {0}}, dtype::Float32());
    benchmark_impl<Pooling>(
            param, shapes, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, dtype::Float32());
    benchmark_impl<Pooling>(
            param, shapes, RUNS, {2, {0, 1}}, {1, {0}}, dtype::Float32());
}

// Compare NCHW (QuantizedS8) against NCHW44 layout pooling for MAX/AVERAGE
// modes over a grid of {window, stride} combinations, 4 threads vs 1.
TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_POOLING_GI_NCHW44) {
    constexpr size_t RUNS = 50;

    using Param = param::Pooling;
    Param param;
    param.pad_h = param.pad_w = 0;
    param.mode = Param::Mode::MAX;
    std::vector<SmallVector<TensorShape>> shapes;
    // each entry is {window, stride}
    std::vector<std::vector<size_t>> filter_and_stride = {
            {2, 1}, {2, 2}, {3, 1}, {3, 2}, {4, 1}, {4, 2}, {5, 1}, {5, 2}};

    for (auto mode : {param::Pooling::Mode::MAX, param::Pooling::Mode::AVERAGE}) {
        // const& avoids copying each std::vector<size_t> per iteration
        for (const auto& filter : filter_and_stride) {
            shapes.push_back({{1, 32 * 4, 215, 215}, {}});
            shapes.push_back({{1, 32 * 4, 128, 128}, {}});
            shapes.push_back({{1, 16 * 4, 56, 56}, {}});

            param.mode = mode;
            param.window_h = param.window_w = filter[0];
            param.stride_h = param.stride_w = filter[1];
            param.format = Param::Format::NCHW;
            // fix: the second specifier printed window_h twice; it must be
            // window_w (matching the NCHW44 report below)
            printf("NCHW Benchmark POOLING kernel:%d*%d stride:%d,mode %d\n",
                   param.window_h, param.window_w, param.stride_h,
                   static_cast<int>(param.mode));
            benchmark_impl<Pooling>(
                    param, shapes, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
                    dtype::QuantizedS8(1.1f));
            shapes.clear();
            // same cases in blocked NCHW44 layout (channel packed by 4)
            shapes.push_back({{1, 32, 215, 215, 4}, {}});
            shapes.push_back({{1, 32, 128, 128, 4}, {}});
            shapes.push_back({{1, 16, 56, 56, 4}, {}});

            param.format = Param::Format::NCHW44;
            printf("NCHW44 Benchmark POOLING kernel:%d*%d stride:%d,mode %d\n",
                   param.window_h, param.window_w, param.stride_h,
                   static_cast<int>(param.mode));
            benchmark_impl<Pooling>(
                    param, shapes, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
                    dtype::QuantizedS8(1.1f));
            shapes.clear();
        }
    }
}
#endif

} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen

Loading…
Cancel
Save