Browse Source

chore(winograd): remove winograd transform code

GitOrigin-RevId: 78c3cfceae
release-1.2
Megvii Engine Team 4 years ago
parent
commit
fc0fcd2f7f
47 changed files with 295 additions and 2165 deletions
  1. +0
    -10
      dnn/include/megdnn/oprs/nn.h
  2. +0
    -23
      dnn/include/megdnn/oprs/utils.h
  3. +1
    -10
      dnn/scripts/opr_param_defs.py
  4. +5
    -25
      dnn/src/arm_common/conv_bias/f16/algos.cpp
  5. +8
    -48
      dnn/src/arm_common/conv_bias/fp32/algos.cpp
  6. +6
    -23
      dnn/src/arm_common/conv_bias/int8/algos.cpp
  7. +0
    -1
      dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_4x4.cpp
  8. +2
    -7
      dnn/src/arm_common/conv_bias/opr_impl.cpp
  9. +1
    -2
      dnn/src/arm_common/handle.cpp
  10. +0
    -179
      dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp
  11. +0
    -28
      dnn/src/arm_common/winograd_filter_preprocess/opr_impl.h
  12. +5
    -145
      dnn/src/common/conv_bias.cpp
  13. +20
    -132
      dnn/src/common/convolution.cpp
  14. +0
    -1
      dnn/src/common/handle_impl.h
  15. +0
    -157
      dnn/src/common/winograd_filter_preprocess.cpp
  16. +0
    -1
      dnn/src/cuda/handle_create.cpp
  17. +0
    -22
      dnn/src/cuda/winograd_filter_preprocess/opr_impl.cpp
  18. +0
    -27
      dnn/src/cuda/winograd_filter_preprocess/opr_impl.h
  19. +4
    -24
      dnn/src/fallback/conv_bias/algos.cpp
  20. +3
    -45
      dnn/src/fallback/conv_bias/opr_impl.cpp
  21. +0
    -6
      dnn/src/fallback/conv_bias/opr_impl.h
  22. +3
    -21
      dnn/src/fallback/conv_bias/winograd/winograd.h
  23. +0
    -2
      dnn/src/fallback/convolution/algos.cpp
  24. +1
    -2
      dnn/src/fallback/convolution/opr_impl.cpp
  25. +0
    -1
      dnn/src/naive/handle.cpp
  26. +0
    -234
      dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
  27. +0
    -28
      dnn/src/naive/winograd_filter_preprocess/opr_impl.h
  28. +2
    -12
      dnn/src/x86/conv_bias/f32/winograd_algo.cpp
  29. +1
    -5
      dnn/src/x86/conv_bias/opr_impl.cpp
  30. +130
    -244
      dnn/test/arm_common/conv_bias_multi_thread.cpp
  31. +86
    -186
      dnn/test/arm_common/conv_bias_multi_thread_weight_preprocess.cpp
  32. +0
    -91
      dnn/test/arm_common/winograd_filter_preprocess.cpp
  33. +0
    -45
      dnn/test/common/conv_bias.cpp
  34. +0
    -3
      dnn/test/common/conv_bias.h
  35. +0
    -1
      dnn/test/common/opr_trait.h
  36. +12
    -59
      dnn/test/x86/conv_bias.cpp
  37. +0
    -1
      src/core/impl/graph/cg_impl.cpp
  38. +0
    -3
      src/gopt/impl/framework.cpp
  39. +0
    -206
      src/gopt/impl/weights_preprocess.cpp
  40. +0
    -32
      src/gopt/include/megbrain/gopt/weights_preprocess.h
  41. +0
    -13
      src/opr/impl/search_policy/algo_chooser.cpp
  42. +0
    -10
      src/opr/impl/tensor_manip.cpp
  43. +0
    -1
      src/opr/impl/tensor_manip.sereg.h
  44. +0
    -4
      src/opr/include/megbrain/opr/search_policy/algo_chooser.h
  45. +0
    -16
      src/opr/include/megbrain/opr/tensor_manip.h
  46. +2
    -28
      src/plugin/impl/opr_footprint.cpp
  47. +3
    -1
      src/serialization/impl/schema.fbs

+ 0
- 10
dnn/include/megdnn/oprs/nn.h View File

@@ -435,16 +435,6 @@ public:
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst) = 0;

/**
* \brief deduce the origin filter layout and conv_bias param after winograd
* transform, this used in fast-run to construct the origin cache-key
*/
static void deduce_winograd_origin_layout_and_param(
const Param::Format format, const size_t output_block_size,
const TensorLayout& src_layout,
const TensorLayout& winograd_filter_layout,
TensorLayout& origin_layout, Param& origin_param);

enum class BiasMode : uint32_t {
NO_BIAS = 0, //!< no bias
BROADCAST_CHANNEL_BIAS, //!< broadcast channel bias, [1, c, 1, 1]


+ 0
- 23
dnn/include/megdnn/oprs/utils.h View File

@@ -91,29 +91,6 @@ class MaxTensorDiff : public OperatorBase {
void check_exec(const TensorLayout& layout1,
const TensorLayout& layout2, size_t workspace_in_bytes);
};

/*!
* \brief winograd preprocess opr.
*
* for the detail \see src/fallback/conv_bias/winograd/winograd.h
*
*/
class WinogradFilterPreprocess : public OperatorBase {
DEF_OPR_PARAM(Winograd);
DEF_OPR_IMPL(WinogradFilterPreprocess, OperatorBase, 1, 1);

public:
virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace) = 0;

size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&);

void deduce_layout(const TensorLayout& src, TensorLayout& dst);

protected:
void check_exec(const TensorLayout& src, const TensorLayout& dst,
size_t workspace_in_bytes);
};
} // namespace megdnn

#include "megdnn/internal/opr_header_epilogue.h"


+ 1
- 10
dnn/scripts/opr_param_defs.py View File

@@ -39,7 +39,7 @@ pdef('Axis').add_fields('int32', 'axis', 0)
'NCHW44','NCHW44_DOT',
Doc('NCHW_WINOGRAD', 'NCHW layout with weights transformed by winograd'),
Doc('NCHW88_WINOGRAD', 'NCHW88 layout with weights transformed by winograd'),
Doc('NCHW44_WINOGRAD', 'NCHW44 layout with weights transformed by winograd'),
Doc('NCHW44_WINOGRAD', 'NCHW44 layout with weights transformed by winograd'),
Doc('NCHW4_NCHW32', 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'),
Doc('NCHW32_NCHW4', 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'),
Doc('NCHW4_NCHW', 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'),
@@ -456,15 +456,6 @@ pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0)
'layout is (K/4, M/4, 4(m), 4(k)) x (K/4, N, 4(k))'))
)

(pdef('Winograd', 'winograd param used in convbias').
add_fields(
'uint32',
Doc('output_block_size', 'output block size, detail meaning see winograd '
'in convbias, equals to the meaning of m in F(m, r)'), 0).
add_enum_alias('Format', 'MatrixMul').
add_enum_alias('ComputeMode', 'Convolution', name_field='compute_mode')
)

(pdef('SVD').
add_fields('bool',
Doc('full_matrices',


+ 5
- 25
dnn/src/arm_common/conv_bias/f16/algos.cpp View File

@@ -27,7 +27,7 @@ using namespace arm_common;
/* ======================= AlgoFP16WinogradF23 ======================== */

bool ConvBiasImpl::AlgoFP16WinogradF23::usable(
const NCBKernSizeParam& param,
const NCBKernSizeParam& param,
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MEGDNN_MARK_USED_VAR(param);
MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 0, 0) {
@@ -37,12 +37,7 @@ bool ConvBiasImpl::AlgoFP16WinogradF23::usable(
strategy, m_tile_size, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::DEFAULT)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -78,12 +73,7 @@ bool ConvBiasImpl::AlgoFP16WinogradF45::usable(
strategy, m_tile_size, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 4 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::DEFAULT)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 5) &&
@@ -117,12 +107,7 @@ bool ConvBiasImpl::AlgoFP16WinogradF63::usable(
strategy, m_tile_size, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 6 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::DEFAULT)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -162,12 +147,7 @@ bool ConvBiasImpl::AlgoFP16WinogradF23_8x8::usable(
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
m_matmul_algo->packmode() == PackMode::NO_PACK &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK8)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&


+ 8
- 48
dnn/src/arm_common/conv_bias/fp32/algos.cpp View File

@@ -47,12 +47,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4::usable(
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
m_matmul_algo->packmode() == PackMode::NO_PACK &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK4)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -86,12 +81,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63::usable(
strategy, m_tile_size, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 6 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::DEFAULT)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -125,12 +115,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF54::usable(
strategy, m_tile_size, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 5 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::DEFAULT)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 4) &&
@@ -164,12 +149,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF45::usable(
strategy, m_tile_size, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 4 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::DEFAULT)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 5) &&
@@ -209,12 +189,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4::usable(
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
m_matmul_algo->packmode() == PackMode::NO_PACK &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 6 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK4)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -257,12 +232,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable(
return m_matmul_algo->usable(matmul_param) &&
m_matmul_algo->packmode() ==
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
(param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW44_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK4)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW44 &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -303,12 +273,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable(
return m_matmul_algo->usable(matmul_param) &&
m_matmul_algo->packmode() ==
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
(param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW44_WINOGRAD &&
param.output_block_size == 6 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK4)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW44 &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -350,12 +315,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44::usable(
return m_matmul_algo->usable(matmul_param) &&
m_matmul_algo->packmode() ==
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
(param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW44_WINOGRAD &&
param.output_block_size == 7 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK4)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW44 &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&


+ 6
- 23
dnn/src/arm_common/conv_bias/int8/algos.cpp View File

@@ -242,14 +242,8 @@ bool ConvBiasImpl::AlgoS8WinogradF23_8x8::usable(
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
m_matmul_algo->packmode() == PackMode::NO_PACK &&
((param.filter_meta.format == param::ConvBias::Format::NCHW &&
param.filter_type.enumv() == DTypeEnum::QuantizedS8) ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK8 &&
param.filter_type.enumv() == DTypeEnum::QuantizedS16)) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW &&
param.filter_type.enumv() == DTypeEnum::QuantizedS8) &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -293,13 +287,8 @@ bool ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::usable(
.get_matmul_kern_param(param));
return is_matmul_usable &&
m_matmul_algo->packmode() == PackMode::NO_PACK &&
((param.filter_meta.format == param::ConvBias::Format::NCHW44 &&
param.filter_type.enumv() == DTypeEnum::QuantizedS8) ||
((param.filter_meta.format ==
param::ConvBias::Format::NCHW44_WINOGRAD) &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK4)) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW44 &&
param.filter_type.enumv() == DTypeEnum::QuantizedS8) &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -341,14 +330,8 @@ bool ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::usable(
.get_matmul_kern_param(param);
bool is_matmul_usable = m_matmul_algo->usable(matmul_param);
return is_matmul_usable &&
((param.filter_meta.format == param::ConvBias::Format::NCHW44 &&
param.filter_type.enumv() == DTypeEnum::QuantizedS8) ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW44_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK8 &&
param.filter_type.enumv() == DTypeEnum::QuantizedS16)) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW44 &&
param.filter_type.enumv() == DTypeEnum::QuantizedS8) &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&


+ 0
- 1
dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_4x4.cpp View File

@@ -240,7 +240,6 @@ void winograd_2x3_4x4_s8_f32_nchw44::filter(const int8_t* filter,
float* transform_mid_buf, size_t OC, size_t IC,
size_t oc_start, size_t oc_end) {
constexpr int alpha = 2 + 3 - 1;

/**
* origin: (4x3) * (3 x 3) * (3 x 4)
*/


+ 2
- 7
dnn/src/arm_common/conv_bias/opr_impl.cpp View File

@@ -290,8 +290,8 @@ ConvBiasImpl::get_all_packed_algo() {
bool ConvBiasImpl::is_matmul_quantized_prefer(
const ConvBiasImpl::NCBKernSizeParam& param) const {
fallback::ConvBiasImpl::NCBKernSizeParam conv_ncb_param(
param, 0, param::MatrixMul::Format::DEFAULT, {}, 0,
BiasMode::NO_BIAS, param::ConvBias::NonlineMode::IDENTITY);
param, {}, 0, BiasMode::NO_BIAS,
param::ConvBias::NonlineMode::IDENTITY);
conv_ncb_param.dst_type = param.bias_type;
conv_ncb_param.filter_meta.group = 1;

@@ -320,11 +320,6 @@ SmallVector<AlgoCategory> ConvBiasImpl::suggest_algo_category_order(
auto FH = param.filter_meta.spatial[0];
auto FW = param.filter_meta.spatial[1];
//! TODO: now winograd only support fast-run
if (param.filter_meta.format == param::ConvBias::Format::NCHW_WINOGRAD ||
param.filter_meta.format == param::ConvBias::Format::NCHW44_WINOGRAD ||
param.filter_meta.format == param::ConvBias::Format::NCHW88_WINOGRAD) {
return {AlgoCategory::WINOGRAD};
}
//! im2col
bool im2col_prefer = (IC >= 32 || OC >= 32);
//! quantized algo use matmul when direct algo is unusable


+ 1
- 2
dnn/src/arm_common/handle.cpp View File

@@ -27,7 +27,7 @@
#include "src/arm_common/type_cvt/opr_impl.h"
#include "src/arm_common/reduce/opr_impl.h"
#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/arm_common/winograd_filter_preprocess/opr_impl.h"

namespace megdnn {
namespace arm_common {
@@ -50,7 +50,6 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(WarpPerspective)
MEGDNN_SPECIALIZE_CREATE_OPERATOR(TypeCvt)
MEGDNN_SPECIALIZE_CREATE_OPERATOR(Reduce)
MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvBias)
MEGDNN_SPECIALIZE_CREATE_OPERATOR(WinogradFilterPreprocess)
MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionBackwardData)

#pragma GCC diagnostic push


+ 0
- 179
dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp View File

@@ -1,179 +0,0 @@
/**
* \file dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "src/arm_common/winograd_filter_preprocess/opr_impl.h"
#include "src/arm_common/handle.h"
#include "src/common/utils.h"
#include "src/arm_common/conv_bias/fp32/strategy.h"
#include "src/arm_common/conv_bias/int8/strategy.h"
#include "src/arm_common/conv_bias/f16/strategy.h"

#include "midout.h"
MIDOUT_DECL(megdnn_arm_common_winograd_filter_preprocess)

using namespace megdnn;
using namespace arm_common;

void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
_megdnn_workspace workspace) {
using namespace winograd;
check_exec(src.layout, dst.layout, workspace.size);

//! NCHW44 group conv or NCHW group conv or both dense conv
size_t flt_start = 0;
size_t pack_c_size = 1;
size_t group = 1;
if (src.layout.ndim == 5) { //! {g, OC, IC, FH, FW}
flt_start = 1;
group = src.layout[0];
} else if (src.layout.ndim == 6) { //! {OC/4, IC/4, FH, FW, 4, 4}
pack_c_size = src.layout[5];
} else if (src.layout.ndim == 7) { //! {g, OC/4, IC/4, FH, FW, 4, 4}
flt_start = 1;
group = src.layout[0];
pack_c_size = src.layout[6];
}
size_t OC = src.layout[flt_start] * pack_c_size,
IC = src.layout[flt_start + 1] * pack_c_size,
FW = src.layout[flt_start + 3];
size_t m = param().output_block_size;

bool execed = false;

#define DISPATCH(_strategy, _format, ...) \
MIDOUT_BEGIN(megdnn_arm_common_winograd_filter_preprocess, \
##__VA_ARGS__) { \
if (param().format == _format) { \
for (size_t g = 0; g < group; g++) { \
auto run = [=]() { \
_strategy strategy(src.layout.dtype, src.layout.dtype, \
src.layout.dtype); \
megdnn::winograd::ConvBias<_strategy, _format>(strategy, \
1_z) \
.filter_process(src_ptr, dst_ptr, workspace_ptr, \
OC, IC); \
}; \
MEGDNN_DISPATCH_CPU_KERN_OPR(run()); \
src_ptr += src.layout.stride[0]; \
dst_ptr += dst.layout.stride[0]; \
} \
execed = true; \
} \
} \
MIDOUT_END();

if (src.layout.dtype.enumv() == DTypeEnum::Float32) {
const float* src_ptr = src.ptr<float>();
float* dst_ptr = dst.ptr<float>();
float* workspace_ptr = workspace.ptr<float>();
if (FW == 3) {
if (m == 2) {
if (pack_c_size == 1) {
DISPATCH(winograd_2x3_4x4_f, param::Winograd::Format::MK4,
0, 0);
} else if (pack_c_size == 4) {
DISPATCH(winograd_F23_mk4_f_nchw44,
param::Winograd::Format::MK4, 0, 5);
}
} else if (m == 6) {
DISPATCH(winograd_6x3_1x1_f, param::Winograd::Format::DEFAULT,
0, 1);
if (pack_c_size == 1) {
DISPATCH(winograd_6x3_4x4_f, param::Winograd::Format::MK4,
0, 2);
} else if (pack_c_size == 4) {
DISPATCH(winograd_F63_mk4_f_nchw44,
param::Winograd::Format::MK4, 0, 6);
}
} else if (m == 7) {
megdnn_assert(pack_c_size == 4, "WINOGRAD F(7,3) Only Supports NCHW44");
DISPATCH(winograd_F73_mk4_f_nchw44,
param::Winograd::Format::MK4, 0, 7);
}
} else if (FW == 4) {
if (m == 5) {
DISPATCH(winograd_5x4_1x1_f, param::Winograd::Format::DEFAULT,
0, 3);
}
} else if (FW == 5) {
if (m == 4) {
DISPATCH(winograd_4x5_1x1_f, param::Winograd::Format::DEFAULT,
0, 4);
}
}
}
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
const dt_int8* src_ptr = src.compatible_ptr<dt_int8>();
if (param().compute_mode == param::ConvBias::ComputeMode::DEFAULT) {
dt_int16* dst_ptr = dst.compatible_ptr<dt_int16>();
dt_int16* workspace_ptr = workspace.ptr<dt_int16>();
if (FW == 3) {
if (m == 2) {
if (pack_c_size == 1) {
DISPATCH(winograd_2x3_8x8_s8,
param::Winograd::Format::MK8, 1, 0);
} else if (pack_c_size == 4) {
DISPATCH(winograd_2x3_8x8_s8_nchw44,
param::Winograd::Format::MK8, 1, 0);
}else{
megdnn_throw("only support pack_c_size = 1 or 4");
}
}
}
} else {
dt_int32* dst_ptr_tmp = dst.compatible_ptr<dt_int32>();
dt_int32* workspace_ptr_tmp = workspace.ptr<dt_int32>();
float* dst_ptr = reinterpret_cast<float*>(dst_ptr_tmp);
float* workspace_ptr = reinterpret_cast<float*>(workspace_ptr_tmp);
if (pack_c_size == 4) {
if (FW == 3) {
if (m == 2) {
DISPATCH(winograd_2x3_4x4_s8_f32_nchw44,
param::Winograd::Format::MK4, 1, 1);
}
}
} else {
megdnn_throw("only support pack_c_size == 4");
}
}
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
if (src.layout.dtype.enumv() == DTypeEnum::Float16) {
const dt_float16* src_ptr = src.ptr<dt_float16>();
dt_float16* dst_ptr = dst.ptr<dt_float16>();
dt_float16* workspace_ptr = workspace.ptr<dt_float16>();
if (FW == 3) {
if (m == 2) {
DISPATCH(winograd_2x3_4x4_f16, param::Winograd::Format::DEFAULT,
2, 0);
DISPATCH(winograd_2x3_8x8_f16, param::Winograd::Format::MK8, 2,
1);
} else if (m == 6) {
DISPATCH(winograd_6x3_1x1_f16, param::Winograd::Format::DEFAULT,
2, 2);
}
} else if (FW == 5) {
if (m == 4) {
DISPATCH(winograd_4x5_1x1_f16, param::Winograd::Format::DEFAULT,
2, 3);
}
}
}
#endif
#undef DISPATCH

megdnn_assert(execed,
"Unsupport winograd filter preprocess. m: %zu src: %s", m,
src.layout.to_string().c_str());
}

// vim: syntax=cpp.doxygen

+ 0
- 28
dnn/src/arm_common/winograd_filter_preprocess/opr_impl.h View File

@@ -1,28 +0,0 @@
/**
* \file dnn/src/arm_common/winograd_filter_preprocess/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/common/utils.h"

namespace megdnn {
namespace arm_common {

class WinogradFilterPreprocessImpl : public WinogradFilterPreprocess {
public:
using WinogradFilterPreprocess::WinogradFilterPreprocess;
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
};

} // namespace arm_common
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 5
- 145
dnn/src/common/conv_bias.cpp View File

@@ -35,37 +35,11 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst, size_t workspace_in_bytes,
const PreprocessedFilter* preprocessed_filter) {
if ((param().format == param::ConvBias::Format::NCHW_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW88_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW44_WINOGRAD) &&
src.dtype.category() == DTypeCategory::QUANTIZED) {
megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16 ||
//!int8 winogradf23_44 using float,QuantizedS32 take the scale
filter.dtype.enumv() == DTypeEnum::QuantizedS32);
megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 ||
src.dtype.enumv() == DTypeEnum::Quantized8Asymm);
} else {
megdnn_assert(src.dtype.enumv() == filter.dtype.enumv());
}
megdnn_assert(src.dtype.enumv() == filter.dtype.enumv());
if (src.dtype.enumv() == DTypeEnum::QuantizedS8) {
if (bias.dtype.enumv() == DTypeEnum::QuantizedS32) {
float scale_src = src.dtype.param<dtype::QuantizedS8>().scale;
float scale_filter = 0.f;
if (param().format == param::ConvBias::Format::NCHW_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW88_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW44_WINOGRAD) {
if (filter.dtype.enumv() == DTypeEnum::QuantizedS32) {
//! int8 winogradf23_44 using float,QuantizedS32 take the
//! scale
scale_filter =
filter.dtype.param<dtype::QuantizedS32>().scale;
} else {
scale_filter =
filter.dtype.param<dtype::QuantizedS16>().scale;
}
} else {
scale_filter = filter.dtype.param<dtype::QuantizedS8>().scale;
}
float scale_filter = filter.dtype.param<dtype::QuantizedS8>().scale;
float scale_bias = bias.dtype.param<dtype::QuantizedS32>().scale;
megdnn_assert(
std::abs(scale_src * scale_filter - scale_bias) < 1e-6,
@@ -77,15 +51,8 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
} else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) {
if (bias.dtype.enumv() == DTypeEnum::QuantizedS32) {
float scale_src = src.dtype.param<dtype::Quantized8Asymm>().scale;
float scale_filter = 0.f;
if (param().format == param::ConvBias::Format::NCHW_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW88_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW44_WINOGRAD) {
scale_filter = filter.dtype.param<dtype::QuantizedS16>().scale;
} else {
scale_filter =
filter.dtype.param<dtype::Quantized8Asymm>().scale;
}
float scale_filter =
filter.dtype.param<dtype::Quantized8Asymm>().scale;
float scale_bias = bias.dtype.param<dtype::QuantizedS32>().scale;
megdnn_assert(
std::abs(scale_src * scale_filter - scale_bias) < 1e-6,
@@ -115,7 +82,6 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
if (check_eq(bias, dst))
return ret;
if (param().format == param::ConvBias::Format::NCHW ||
param().format == param::ConvBias::Format::NCHW_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW4_NCHW) {
megdnn_assert(bias.shape[0] == 1);
megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s",
@@ -131,7 +97,6 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
} else if (param().format == param::ConvBias::Format::NCHW4 ||
param().format == param::ConvBias::Format::NCHW44 ||
param().format == param::ConvBias::Format::NCHW44_DOT ||
param().format == param::ConvBias::Format::NCHW44_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW32_NCHW4) {
megdnn_assert(bias.shape[0] == 1);
megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s",
@@ -140,8 +105,7 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
megdnn_assert(bias.shape[3] == 1);
megdnn_assert(bias.shape[4] == 4);
} else if (param().format == param::ConvBias::Format::NCHW8 ||
param().format == param::ConvBias::Format::NCHW88 ||
param().format == param::ConvBias::Format::NCHW88_WINOGRAD) {
param().format == param::ConvBias::Format::NCHW88 ) {
megdnn_assert(bias.shape[0] == 1);
megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s",
bias.to_string().c_str(), dst.to_string().c_str());
@@ -175,11 +139,6 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
}

if (z.ndim != 0) {
megdnn_assert(param().format != param::ConvBias::Format::NCHW_WINOGRAD);
megdnn_assert(param().format !=
param::ConvBias::Format::NCHW88_WINOGRAD);
megdnn_assert(param().format !=
param::ConvBias::Format::NCHW44_WINOGRAD);
megdnn_assert(param().format != param::ConvBias::Format::NCHW4_NCHW32);
megdnn_assert(param().format != param::ConvBias::Format::NCHW32_NCHW4);
megdnn_assert(z.dtype.enumv() == dst.dtype.enumv());
@@ -187,105 +146,6 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
}
return ret;
}
/*!
* \brief deduce the origin filter layout and param after winograd transformed
*/
void ConvBiasForward::deduce_winograd_origin_layout_and_param(
const Param::Format format, const size_t output_block_size,
const TensorLayout& src_layout,
const TensorLayout& winograd_filter_layout, TensorLayout& origin_layout,
Param& origin_param) {
if (format == megdnn::param::ConvBias::Format::NCHW88_WINOGRAD ||
format == megdnn::param::ConvBias::Format::NCHW44_WINOGRAD ||
format == megdnn::param::ConvBias::Format::NCHW_WINOGRAD) {
//! change NCHWxx_WINOGRAD to NCHWxx
size_t OC = 0;
size_t IC = 0;
size_t GROUP = 1;
size_t FH = winograd_filter_layout[1] - output_block_size + 1;

//! {alpha, alpha, IC, OC}
if (winograd_filter_layout.ndim == 4) {
OC = winograd_filter_layout[3];
IC = winograd_filter_layout[2];
}
//! {group, alpha, alpha, IC, OC}
else if (winograd_filter_layout.ndim == 5) {
OC = winograd_filter_layout[4];
IC = winograd_filter_layout[3];
GROUP = winograd_filter_layout[0];
}
//! {alpha, alpha, OC/f, IC/f, f, f}
else if (winograd_filter_layout.ndim == 6) {
OC = winograd_filter_layout[2] * winograd_filter_layout[5];
IC = winograd_filter_layout[3] * winograd_filter_layout[4];
}
//! {group, alpha, alpha, OC/f, IC/f, f, f}
else if (winograd_filter_layout.ndim == 7) {
OC = winograd_filter_layout[3] * winograd_filter_layout[6];
IC = winograd_filter_layout[4] * winograd_filter_layout[5];
GROUP = winograd_filter_layout[0];
}
auto origin_data_type = winograd_filter_layout.dtype;
if (src_layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
if (origin_data_type.enumv() == DTypeEnum::QuantizedS16) {
float scale =
origin_data_type.param<dtype::QuantizedS16>().scale;
origin_data_type = megdnn::dtype::QuantizedS8(scale);
} else {
//! In order to bring the scale of filter, the transformed
//! qint8 winograd filter computing with float dtype is Qint32
megdnn_assert(origin_data_type.enumv() ==
DTypeEnum::QuantizedS32);
float scale =
origin_data_type.param<dtype::QuantizedS32>().scale;
origin_data_type = megdnn::dtype::QuantizedS8(scale);
}
}

if (GROUP == 1) {
if (format == megdnn::param::ConvBias::Format::NCHW_WINOGRAD) {
origin_layout =
TensorLayout({OC, IC, FH, FH}, origin_data_type);
} else if (format ==
megdnn::param::ConvBias::Format::NCHW44_WINOGRAD) {
origin_layout = TensorLayout({OC / 4, IC / 4, FH, FH, 4, 4},
origin_data_type);
} else {
megdnn_assert(format ==
megdnn::param::ConvBias::Format::NCHW88_WINOGRAD);
origin_layout = TensorLayout({OC / 8, IC / 8, FH, FH, 8, 8},
origin_data_type);
}
} else {
if (format == megdnn::param::ConvBias::Format::NCHW_WINOGRAD) {
origin_layout =
TensorLayout({GROUP, OC, IC, FH, FH}, origin_data_type);
} else if (format ==
megdnn::param::ConvBias::Format::NCHW44_WINOGRAD) {
origin_layout =
TensorLayout({GROUP, OC / 4, IC / 4, FH, FH, 4, 4},
origin_data_type);
} else {
megdnn_assert(format ==
megdnn::param::ConvBias::Format::NCHW88_WINOGRAD);
origin_layout =
TensorLayout({GROUP, OC / 8, IC / 8, FH, FH, 8, 8},
origin_data_type);
}
}
origin_param.output_block_size = 0;
if (format == megdnn::param::ConvBias::Format::NCHW_WINOGRAD) {
origin_param.format = megdnn::param::ConvBias::Format::NCHW;
} else if (format == megdnn::param::ConvBias::Format::NCHW44_WINOGRAD) {
origin_param.format = megdnn::param::ConvBias::Format::NCHW44;
} else {
megdnn_assert(format ==
megdnn::param::ConvBias::Format::NCHW88_WINOGRAD);
origin_param.format = megdnn::param::ConvBias::Format::NCHW88;
}
}
}

template <typename T>
struct NCHWParamTrait;


+ 20
- 132
dnn/src/common/convolution.cpp View File

@@ -41,36 +41,12 @@ uint32_t spatial_getter(uint32_t filter, const Param&) {
return filter;
}

template <>
uint32_t
spatial_getter<param::ConvBias, param::ConvBias::Format::NCHW_WINOGRAD>(
uint32_t filter, const param::ConvBias& param) {
//! f = m + r - 1 -> r = f + 1 - m
return filter - param.output_block_size + 1;
}

template <>
uint32_t
spatial_getter<param::ConvBias, param::ConvBias::Format::NCHW88_WINOGRAD>(
uint32_t filter, const param::ConvBias& param) {
//! f = m + r - 1 -> r = f + 1 - m
return filter - param.output_block_size + 1;
}
template <>
uint32_t
spatial_getter<param::ConvBias, param::ConvBias::Format::NCHW44_WINOGRAD>(
uint32_t filter, const param::ConvBias& param) {
//! f = m + r - 1 -> r = f + 1 - m
return filter - param.output_block_size + 1;
}

template <typename Parameter, typename Param>
void make_canonized_filter_meta_nchw_nhwc(
size_t src_ndim, const TensorLayout& filter, const Param& param,
typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) {
megdnn_assert(param.format == Param::Format::NCHW ||
param.format == Param::Format::NHWC ||
param.format == Param::Format::NCHW_WINOGRAD);
param.format == Param::Format::NHWC );
auto img_ndim = src_ndim - 2;
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos;
if (param.sparse == Param::Sparse::DENSE) {
@@ -101,20 +77,6 @@ void make_canonized_filter_meta_nchw_nhwc(
flt_spatial_start = 2;
ocpg_pos = 0;
icpg_pos = 1;
} else if (param.format == Param::Format::NCHW_WINOGRAD) {
// filter should be (alphah, alphaw, ic, oc) or (alphah, alphaw, ocb,
// icb, ic_block_size, oc_block_size)
flt_spatial_start = 0;
if (filter.ndim == flt_start + 4) {
ocpg_pos = 3;
icpg_pos = 2;
} else {
megdnn_assert(filter.ndim == flt_start + 6);
ic_block_size = filter[flt_start + 4];
oc_block_size = filter[flt_start + 5];
ocpg_pos = 2;
icpg_pos = 3;
}
} else {
megdnn_assert(param.format == Param::Format::NHWC,
"invalid conv tensor format");
@@ -136,14 +98,8 @@ void make_canonized_filter_meta_nchw_nhwc(
megdnn_assert(dilation[i] > 0,
"invalid dilation on spatial dim %zu: %u", i,
dilation[i]);
if (param.format == Param::Format::NCHW_WINOGRAD) {
ret.spatial[i] =
spatial_getter<Param, Param::Format::NCHW_WINOGRAD>(
filter[i + flt_start + flt_spatial_start], param);
} else {
ret.spatial[i] = spatial_getter<Param, Param::Format::NCHW>(
filter[i + flt_start + flt_spatial_start], param);
}
ret.spatial[i] = spatial_getter<Param, Param::Format::NCHW>(
filter[i + flt_start + flt_spatial_start], param);
ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1;
}
}
@@ -295,20 +251,12 @@ void make_canonized_filter_meta_nchwxx(
* FH, FW, pack_size(IC), pack_size(OC)} [group]
* {GROUP/pack_size, 1, 1, FH, FW, pack_size} [chan]
*
** NCHW88_WINOGRAD and NCHW44_WINOGRAD mode
* filter:
* {alpha, alpha, OC/pack_size, IC/pack_size, pack_size(IC),
*pack_size(OC)} [dense]
* {GROUP, alpha, alpha, OC_PER_GROUP/pack_size,
* IC_PER_GROUP/pack_size, pack_size(IC), pack_size(OC)} [group]
*
*/

megdnn_assert(param.format == Param::Format::NCHW88 ||
param.format == Param::Format::NCHW44 ||
param.format == Param::Format::NCHW44_WINOGRAD ||
param.format == Param::Format::NCHW44_DOT ||
param.format == Param::Format::NCHW88_WINOGRAD);
param.format == Param::Format::NCHW44_DOT);
size_t img_ndim = 2;
size_t flt_start = 0;
size_t flt_spatial_start = 2;
@@ -325,10 +273,6 @@ void make_canonized_filter_meta_nchwxx(
filter[filter.ndim - 1]);
ret.group = 1;
flt_start = 0;
if (param.format == Param::Format::NCHW88_WINOGRAD ||
param.format == Param::Format::NCHW44_WINOGRAD) {
flt_start = 2;
}
if (filter[filter.ndim - 2] == 2 * pack_size &&
filter[filter.ndim - 1] == 2 * pack_size) {
pack_c_size = 2 * pack_size;
@@ -339,10 +283,6 @@ void make_canonized_filter_meta_nchwxx(
ret.icpg = filter[flt_start + 1] * pack_c_size;
} else if (filter.ndim == img_ndim + 3) {
// ohwi8o
megdnn_assert(param.format != Param::Format::NCHW88_WINOGRAD,
"Hybrid nchw88 mode in not support winograd");
megdnn_assert(param.format != Param::Format::NCHW44_WINOGRAD,
"Hybrid nchw44 mode in not support winograd");
flt_start = 0;
flt_spatial_start = 1;
ret.group = 1;
@@ -357,15 +297,9 @@ void make_canonized_filter_meta_nchwxx(
megdnn_assert(param.sparse == Param::Sparse::GROUP,
"invalid convolution sparse type");
flt_start = 1;
if (param.format == Param::Format::NCHW88_WINOGRAD ||
param.format == Param::Format::NCHW44_WINOGRAD) {
flt_start = 3;
}
auto filter_oc = filter[flt_start];
auto filter_ic = filter[flt_start + 1];
if (filter_oc == 1 && filter_ic == 1 && filter.ndim == (img_ndim + 4) &&
param.format != Param::Format::NCHW88_WINOGRAD &&
param.format != Param::Format::NCHW44_WINOGRAD) {
if (filter_oc == 1 && filter_ic == 1 && filter.ndim == (img_ndim + 4)) {
// Depthwise case goihw8g
megdnn_assert(filter.ndim == img_ndim + 4,
"bad filter ndim for group convolution: "
@@ -416,17 +350,7 @@ void make_canonized_filter_meta_nchwxx(
"NCHWXX has invalid dilation on spatial dim %zu: %u, "
"require to be 1",
i, dilation[i]);
if (param.format == Param::Format::NCHW88_WINOGRAD) {
ret.spatial[i] =
spatial_getter<Param, Param::Format::NCHW88_WINOGRAD>(
filter[i + flt_start - 2], param);
} else if (param.format == Param::Format::NCHW44_WINOGRAD) {
ret.spatial[i] =
spatial_getter<Param, Param::Format::NCHW44_WINOGRAD>(
filter[i + flt_start - 2], param);
} else {
ret.spatial[i] = filter[i + flt_start + flt_spatial_start];
}
ret.spatial[i] = filter[i + flt_start + flt_spatial_start];
ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1;
}
}
@@ -579,13 +503,11 @@ ConvolutionBase<Parameter>::make_canonized_filter_meta(
} else if (param().format == Param::Format::NCHW8) {
make_canonized_filter_meta_nchwx<8, Parameter>(src_ndim, filter,
param(), ret);
} else if (param().format == Param::Format::NCHW88 ||
param().format == Param::Format::NCHW88_WINOGRAD) {
} else if (param().format == Param::Format::NCHW88) {
make_canonized_filter_meta_nchwxx<8, Parameter>(src_ndim, filter,
param(), ret);
} else if (param().format == Param::Format::NCHW44 ||
param().format == Param::Format::NCHW44_DOT ||
param().format == Param::Format::NCHW44_WINOGRAD) {
param().format == Param::Format::NCHW44_DOT) {
make_canonized_filter_meta_nchwxx<4, Parameter>(src_ndim, filter,
param(), ret);
} else if (param().format == Param::Format::NCHW32 ||
@@ -597,8 +519,7 @@ ConvolutionBase<Parameter>::make_canonized_filter_meta(
param(), ret);
} else {
megdnn_assert(param().format == Param::Format::NHWC ||
param().format == Param::Format::NCHW ||
param().format == Param::Format::NCHW_WINOGRAD);
param().format == Param::Format::NCHW);
make_canonized_filter_meta_nchw_nhwc<Parameter>(src_ndim, filter,
param(), ret);
}
@@ -619,17 +540,8 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src,
} else if (src.enumv() == DTypeEnum::QuantizedS8 ||
src.enumv() == DTypeEnum::Quantized8Asymm ||
src.enumv() == DTypeEnum::Quantized4Asymm) {
//! Qint8 winograd compute with float, in order to bringing the filter
//! scale, here just use QuantizedS32 as filter type.
if (src.enumv() == DTypeEnum::QuantizedS8 &&
filter.enumv() == DTypeEnum::QuantizedS32) {
supported_dst_dtype.push_back(dtype::QuantizedS32(
src.param<dtype::QuantizedS8>().scale *
filter.param<dtype::QuantizedS32>().scale));
} else {
supported_dst_dtype.push_back(
dtype::QuantizedS32(mul_scale(src, filter)));
}
supported_dst_dtype.push_back(
dtype::QuantizedS32(mul_scale(src, filter)));
if (dst.valid() && dst.enumv() == src.enumv()) {
supported_dst_dtype.push_back(dst);
}
@@ -681,24 +593,12 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
megdnn_assert_contiguous(src);
megdnn_assert_contiguous(filter);
megdnn_assert(src.ndim >= 3_z, "%s", errmsg().c_str());
if ((param().format == Param::Format::NCHW_WINOGRAD ||
param().format == Param::Format::NCHW44_WINOGRAD) &&
src.dtype.category() == DTypeCategory::QUANTIZED) {
megdnn_assert((filter.dtype.enumv() == DTypeEnum::QuantizedS16 ||
filter.dtype.enumv() == DTypeEnum::QuantizedS32),
"%s", errmsg().c_str());
megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 ||
src.dtype.enumv() == DTypeEnum::Quantized8Asymm,
"%s", errmsg().c_str());
} else {
megdnn_assert(src.dtype.enumv() == filter.dtype.enumv(), "%s",
errmsg().c_str());
}
megdnn_assert(src.dtype.enumv() == filter.dtype.enumv(), "%s",
errmsg().c_str());
check_or_deduce_dtype_fwd(src.dtype, filter.dtype, dst.dtype);
size_t img_dim;
if (param().format == Param::Format::NCHW ||
param().format == Param::Format::NHWC ||
param().format == Param::Format::NCHW_WINOGRAD) {
param().format == Param::Format::NHWC) {
img_dim = src.ndim - 2;
megdnn_assert(filter.ndim >= img_dim + 2 && filter.ndim <= img_dim + 6,
"%s", errmsg().c_str());
@@ -714,8 +614,6 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
param().format == Param::Format::NCHW32 ||
param().format == Param::Format::NCHW32_NCHW4 ||
param().format == Param::Format::NCHW88 ||
param().format == Param::Format::NCHW88_WINOGRAD ||
param().format == Param::Format::NCHW44_WINOGRAD ||
param().format == Param::Format::CHWN4);
img_dim = src.ndim - 3;
if ((param().format == Param::Format::NCHW88 ||
@@ -770,8 +668,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
"but got src %s, filter %s",
src.to_string().c_str(), filter.to_string().c_str());
}
if (param().format == Param::Format::NCHW88 ||
param().format == Param::Format::NCHW88_WINOGRAD) {
if (param().format == Param::Format::NCHW88) {
megdnn_assert((src.ndim == 4 && filter.ndim == 5 &&
filter[filter.ndim - 1] == 8) ||
(src.ndim == 5 &&
@@ -786,8 +683,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
src.to_string().c_str(), filter.to_string().c_str());
}
if (param().format == Param::Format::NCHW44 ||
param().format == Param::Format::NCHW44_DOT ||
param().format == Param::Format::NCHW44_WINOGRAD) {
param().format == Param::Format::NCHW44_DOT) {
//! support nchw44 filter change to 88 for int8 winograd_f23_88 using MK8 matmul
megdnn_assert((src.ndim == 4 && filter.ndim == 5 &&
filter[filter.ndim - 1] == 4) ||
@@ -820,12 +716,10 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
"currently only convolution on 2D image is supported");
auto cflt = make_canonized_filter_meta(src.ndim, filter);
if (param().format == Param::Format::NCHW ||
param().format == Param::Format::NHWC ||
param().format == Param::Format::NCHW_WINOGRAD) {
param().format == Param::Format::NHWC ) {
size_t src_or_dst_c_pos = 0;
size_t src_or_dst_spatial_start = 0;
if (param().format == Param::Format::NCHW ||
param().format == Param::Format::NCHW_WINOGRAD) {
if (param().format == Param::Format::NCHW) {
src_or_dst_c_pos = 1;
src_or_dst_spatial_start = 2;
} else {
@@ -836,10 +730,6 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
}
megdnn_assert(cflt.icpg * cflt.group == src[src_or_dst_c_pos], "%s",
errmsg().c_str());
if (param().format == Param::Format::NCHW_WINOGRAD) {
megdnn_assert(cflt.spatial[0] == cflt.spatial[1],
"NCHW_WINOGRAD only support conv with fh == fw");
}
dst.ndim = src.ndim;
dst[0] = src[0];
dst[src_or_dst_c_pos] = cflt.ocpg * cflt.group;
@@ -900,8 +790,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1],
cflt.stride[1], cflt.padding[1]);
dst[4] = 32;
} else if (param().format == Param::Format::NCHW88 ||
param().format == Param::Format::NCHW88_WINOGRAD) {
} else if (param().format == Param::Format::NCHW88 ) {
megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8),
"invalid src ndim for NCHW88, expected=5 or 4, got=%zu",
src.ndim);
@@ -923,8 +812,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
}

} else if (param().format == Param::Format::NCHW44 ||
param().format == Param::Format::NCHW44_DOT ||
param().format == Param::Format::NCHW44_WINOGRAD) {
param().format == Param::Format::NCHW44_DOT) {
megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 4),
"invalid src ndim for NCHW44, expected=5 or 4, got=%zu",
src.ndim);


+ 0
- 1
dnn/src/common/handle_impl.h View File

@@ -189,7 +189,6 @@ private:
cb(RelayoutFormat) \
cb(TopK) \
cb(PowC) \
cb(WinogradFilterPreprocess) \
cb(LocalShareForward) \
cb(LocalShareBackwardData) \
cb(LocalShareBackwardFilter) \


+ 0
- 157
dnn/src/common/winograd_filter_preprocess.cpp View File

@@ -1,157 +0,0 @@
/**
* \file dnn/src/common/winograd_filter_preprocess.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megdnn/oprs.h"

#include <numeric>
#include "src/common/utils.h"

using namespace megdnn;
void WinogradFilterPreprocess::deduce_layout(const TensorLayout& src,
TensorLayout& dst) {
auto errmsg = [&]() {
return "invalid filter layout:" + megdnn_layout_msg(src);
};
MEGDNN_MARK_USED_VAR(errmsg);
//! NCHW88 weight layout include
//! dense{oc/8, ic/8, fh, fw, 8, 8}; group {g, oc/8, ic/8, fh, fw, 8, 8};
//! channel wise{g/8, 1, 1, fh, fw, 8}
megdnn_assert(
src.ndim == 4 || src.ndim == 5 || src.ndim == 6 || src.ndim == 7,
"%s", errmsg().c_str());
//! nchw88 channel wise conv
megdnn_assert(!(src.ndim == 6 && src[1] == 1 && src[2] == 1),
"chennel wise nchw88 can not use winograd ");
//! nchw88 group conv
size_t flt_start = 0;
size_t pack_c_size = 1;
size_t group = 1;
//! group conv
if (src.ndim == 5) {
flt_start = 1;
group = src[0];
//! nchw88 dense conv
} else if (src.ndim == 6) {
pack_c_size = src[5];
//! nchw88 group conv
} else if (src.ndim == 7) {
flt_start = 1;
group = src[0];
pack_c_size = src[6];
}
size_t OC = src[flt_start] * pack_c_size,
IC = src[flt_start + 1] * pack_c_size, FH = src[flt_start + 2],
FW = src[flt_start + 3];
size_t m = param().output_block_size;
megdnn_assert(FH == FW, "%s", errmsg().c_str());

size_t alpha = FH + m - 1;
DType dst_type = src.dtype;
if (src.dtype.category() == DTypeCategory::QUANTIZED) {
megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8);
if (param().compute_mode ==
param::ConvBias::ComputeMode::DEFAULT) {
//! input int8 compute short
dst_type = dtype::QuantizedS16(
src.dtype.param<dtype::QuantizedS8>().scale);
} else {
//! input int8 compute float32
dst_type = dtype::QuantizedS32(
src.dtype.param<dtype::QuantizedS8>().scale);
}
}

if (src.ndim == 4 || src.ndim == 6) {
if (param().format == param::Winograd::Format::DEFAULT) {
dst = TensorLayout({alpha, alpha, IC, OC}, dst_type);
} else {
megdnn_assert(param().format == param::Winograd::Format::MK4 ||
param().format == param::Winograd::Format::MK8);
size_t pack_size = MatrixMulForward::pack_size(param().format);
dst = TensorLayout({alpha, alpha, OC / pack_size, IC / pack_size,
pack_size, pack_size},
dst_type);
}
} else {
megdnn_assert(src.ndim == 5 || src.ndim == 7);
if (param().format == param::Winograd::Format::DEFAULT) {
dst = TensorLayout({group, alpha, alpha, IC, OC}, dst_type);
} else {
megdnn_assert(param().format == param::Winograd::Format::MK4 ||
param().format == param::Winograd::Format::MK8);
size_t pack_size = MatrixMulForward::pack_size(param().format);
dst = TensorLayout({group, alpha, alpha, OC / pack_size,
IC / pack_size, pack_size, pack_size},
dst_type);
}
}
}

void WinogradFilterPreprocess::check_exec(const TensorLayout& src,
const TensorLayout& dst,
size_t workspace_in_bytes) {
auto errmsg = [&]() {
return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(dst);
};
MEGDNN_MARK_USED_VAR(errmsg);
megdnn_assert_contiguous(src);
megdnn_assert_contiguous(dst);
//! nchwxx now only support Format MKx
if (param().format == param::Winograd::Format::DEFAULT) {
megdnn_assert(src.ndim == dst.ndim && (src.ndim == 4 || src.ndim == 5),
"%s", errmsg().c_str());
} else {
megdnn_assert(
(param().format == param::Winograd::Format::MK4 ||
param().format == param::Winograd::Format::MK8) &&
(src.ndim == dst.ndim - 2 || src.ndim == dst.ndim) &&
(src.ndim == 4 || src.ndim == 5 || src.ndim == 6 ||
src.ndim == 7),
"%s", errmsg().c_str());
}

TensorLayout dst_expected;
deduce_layout(src, dst_expected);
megdnn_assert_eq_layout(dst_expected, dst);
auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
}

size_t WinogradFilterPreprocess::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) {
MEGDNN_MARK_USED_VAR(dst);
DType output_compute_dtype = src.dtype;
if (src.dtype.category() == DTypeCategory::QUANTIZED) {
megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 ||
src.dtype.enumv() == DTypeEnum::Quantized8Asymm);
if (param().compute_mode ==
param::ConvBias::ComputeMode::DEFAULT) {
//! input int8 compute short
output_compute_dtype = dtype::QuantizedS16(
src.dtype.param<dtype::QuantizedS8>().scale);
} else {
//! input int8 compute float32
output_compute_dtype = dtype::QuantizedS32(
src.dtype.param<dtype::QuantizedS8>().scale);
}
}

size_t FW = src[3];
if (src.ndim == 5 || src.ndim == 7) {
FW = src[4];
}

size_t pack_size = MatrixMulForward::pack_size(param().format);
size_t alpha = param().output_block_size + FW - 1;
return 2 * alpha * alpha * output_compute_dtype.size() * pack_size *
pack_size;
}

// vim: syntax=cpp.doxygen

+ 0
- 1
dnn/src/cuda/handle_create.cpp View File

@@ -72,7 +72,6 @@
#include "src/cuda/type_cvt/opr_impl.h"
#include "src/cuda/warp_affine/opr_impl.h"
#include "src/cuda/warp_perspective/opr_impl.h"
#include "src/cuda/winograd_filter_preprocess/opr_impl.h"
#include "src/cuda/local_share/opr_impl.h"
#include "src/cuda/roi_align/opr_impl.h"
#include "src/cuda/batch_conv_bias/opr_impl.h"


+ 0
- 22
dnn/src/cuda/winograd_filter_preprocess/opr_impl.cpp View File

@@ -1,22 +0,0 @@
/**
* \file dnn/src/cuda/winograd_filter_preprocess/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/cuda/winograd_filter_preprocess/opr_impl.h"
#include "src/common/utils.h"

using namespace megdnn;
using namespace cuda;

void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in, _megdnn_tensor_in,
_megdnn_workspace) {
megdnn_throw("WinogradFilterPreprocess is not supported in CUDA");
}

// vim: syntax=cpp.doxygen

+ 0
- 27
dnn/src/cuda/winograd_filter_preprocess/opr_impl.h View File

@@ -1,27 +0,0 @@
/**
* \file dnn/src/cuda/winograd_filter_preprocess/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/oprs.h"

namespace megdnn {
namespace cuda {

class WinogradFilterPreprocessImpl : public WinogradFilterPreprocess {
public:
using WinogradFilterPreprocess::WinogradFilterPreprocess;
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
};

} // namespace cuda
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 4
- 24
dnn/src/fallback/conv_bias/algos.cpp View File

@@ -259,12 +259,7 @@ bool ConvBiasImpl::AlgoWinogradF32::usable(
strategy, UNIT_TILE_SIZE, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::DEFAULT)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -329,12 +324,7 @@ bool ConvBiasImpl::AlgoWinogradF32_4x4::usable(
strategy, UNIT_TILE_SIZE, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK4)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -397,12 +387,7 @@ bool ConvBiasImpl::AlgoWinogradQS8::usable(
.get_matmul_kern_param(param);

return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::DEFAULT)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -467,12 +452,7 @@ bool ConvBiasImpl::AlgoWinogradQS8_8x8::usable(
strategy, UNIT_TILE_SIZE, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK8)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW &&
param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&


+ 3
- 45
dnn/src/fallback/conv_bias/opr_impl.cpp View File

@@ -342,10 +342,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
param().format == Param::Format::NCHW4 ||
param().format == Param::Format::NCHW44 ||
param().format == Param::Format::NCHW44_DOT ||
param().format == Param::Format::NCHW ||
param().format == Param::Format::NCHW_WINOGRAD ||
param().format == Param::Format::NCHW88_WINOGRAD ||
param().format == Param::Format::NCHW44_WINOGRAD) {
param().format == Param::Format::NCHW) {
spatial_pos = 2;
} else if (param().format == Param::Format::NHWC) {
spatial_pos = 1;
@@ -370,25 +367,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
"should be equal");
auto&& fm = check_layout_fwd(src, filter, dst);
auto& conv_fm = reinterpret_cast<ConvolutionImpl::CanonizedFilterMeta&>(fm);

param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT;
if (param().format == Param::Format::NCHW_WINOGRAD ||
param().format == Param::Format::NCHW88_WINOGRAD ||
param().format == Param::Format::NCHW44_WINOGRAD) {
size_t flt_start = 0;
if (param().sparse == Param::Sparse::GROUP) {
flt_start = 1;
}

if (filter.ndim == 6 + flt_start) {
if (filter[5] == 4) {
format = param::MatrixMul::Format::MK4;
} else {
megdnn_assert(filter[5] == 8);
format = param::MatrixMul::Format::MK8;
}
}
}
size_t nr_threads = static_cast<naive::HandleImpl*>(handle())
->megcore_dispatcher()
->nr_threads();
@@ -407,8 +386,6 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
nr_threads,
reinterpret_cast<const ConvolutionForward::PreprocessedFilter*>(
preprocessed_filter)},
param().output_block_size,
format,
bias.dtype,
bias.stride[0],
bias_mode,
@@ -537,11 +514,7 @@ SmallVector<AlgoCategory> ConvBiasImpl::suggest_algo_category_order(
auto FH = param.filter_meta.spatial[0];
auto FW = param.filter_meta.spatial[1];
//! TODO: now winograd only support in fast-run
if (param.filter_meta.format == param::ConvBias::Format::NCHW_WINOGRAD ||
param.filter_meta.format == param::ConvBias::Format::NCHW44_WINOGRAD ||
param.filter_meta.format == param::ConvBias::Format::NCHW88_WINOGRAD) {
return {AlgoCategory::WINOGRAD};
}

//! im2col + matmul
bool im2col_prefer = (IC >= 32 || OC >= 32);
//! quantized algo use matmul when direct algo is unusable
@@ -632,21 +605,6 @@ const T* ConvBiasImpl::NCBKernParam::filter(size_t group_pack_id,

break;
}
case ConvBiasImpl::Param::Format::NCHW_WINOGRAD:
case ConvBiasImpl::Param::Format::NCHW44_WINOGRAD:
case ConvBiasImpl::Param::Format::NCHW88_WINOGRAD: {
//! four format of weight layout
//! 1. {g, alpha, alpha, ocpg/8, icpg/8, 8, 8}
//! 2. {alpha, alpha, ocpg/8, icpg/8, 8, 8}
//! 3. {g, alpha, alpha, oc, ic, 8, 8}
//! 4. {alpha, alpha, oc, ic}
group_offset = pack_group_size * group_pack_id * filter_meta.icpg *
filter_meta.ocpg *
(filter_meta.spatial[0] + output_block_size - 1) *
(filter_meta.spatial[1] + output_block_size - 1) *
filter_type.size();
break;
}
default:
megdnn_assert(0, "other filter format is not support yet");
}


+ 0
- 6
dnn/src/fallback/conv_bias/opr_impl.h View File

@@ -103,19 +103,13 @@ public:
struct NCBKernSizeParam : ConvolutionImpl::NCBKernSizeParam {
NCBKernSizeParam() = default;
NCBKernSizeParam(const ConvolutionImpl::NCBKernSizeParam& param,
size_t output_block_size,
param::MatrixMul::Format winograd_matmul_format,
DType bias_type, ptrdiff_t bias_bs, BiasMode bias_mode,
Param::NonlineMode nonlineMode)
: ConvolutionImpl::NCBKernSizeParam(param),
output_block_size{output_block_size},
winograd_matmul_format{winograd_matmul_format},
bias_type{bias_type},
bias_bs{bias_bs},
bias_mode{bias_mode},
nonlineMode{nonlineMode} {}
size_t output_block_size; //!< used in winograd algo
param::MatrixMul::Format winograd_matmul_format;
DType bias_type;
//! stride for batch of bias
ptrdiff_t bias_bs;


+ 3
- 21
dnn/src/fallback/conv_bias/winograd/winograd.h View File

@@ -88,13 +88,7 @@ class ConvBias {
size_t filter_transform_buf_size = 0;
//! filter : (alpha, alpha, IC, OC) or (OCB, ICB, IC_BLOCK_SIZE,
//! OC_BLOCK_SIZE)
if (param.preprocessed_filter == nullptr &&
param.filter_meta.format !=
param::ConvBias::Format::NCHW_WINOGRAD &&
param.filter_meta.format !=
param::ConvBias::Format::NCHW88_WINOGRAD &&
param.filter_meta.format !=
param::ConvBias::Format::NCHW44_WINOGRAD) {
if (param.preprocessed_filter == nullptr) {
filter_transform_buf_size = Strategy::ALPHA * Strategy::ALPHA * OC *
IC * sizeof(input_filter_compute_type);
}
@@ -108,12 +102,7 @@ class ConvBias {
nullptr,
{winograd_comput_size, filter_transform_buf_size * GROUP});
} else {
megdnn_assert(param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD ||
param.filter_meta.format ==
param::ConvBias::Format::NCHW88_WINOGRAD ||
param.filter_meta.format ==
param::ConvBias::Format::NCHW44_WINOGRAD);
megdnn_assert(param.preprocessed_filter != nullptr);
return WorkspaceBundle(nullptr, {winograd_comput_size});
}
}
@@ -499,7 +488,6 @@ public:
const TensorND& preprocessed_dst =
param.preprocessed_filter->tensors[0];
WorkspaceBundle bundle = get_preprocess_wbundle(param);

Strategy strategy = m_strategy;
SmallVector<NCBKern> kerns;
auto filter_process_kern =
@@ -558,13 +546,7 @@ public:
param.filter_meta.stride[1] == 1 &&
(param.filter_meta.format == param::ConvBias::Format::NCHW ||
param.filter_meta.format == param::ConvBias::Format::NCHW88 ||
param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
param.filter_meta.format ==
param::ConvBias::Format::NCHW_WINOGRAD ||
param.filter_meta.format ==
param::ConvBias::Format::NCHW88_WINOGRAD ||
param.filter_meta.format ==
param::ConvBias::Format::NCHW44_WINOGRAD));
param.filter_meta.format == param::ConvBias::Format::NCHW44));

SmallVector<NCBKern> kerns;
if (param.preprocessed_filter == nullptr &&


+ 0
- 2
dnn/src/fallback/convolution/algos.cpp View File

@@ -316,8 +316,6 @@ ConvolutionImpl::AlgoDefault::init_conv_bias_param(
mul_scale(param.src_type, param.filter_type));
}
return {param,
0,
param::MatrixMul::Format::DEFAULT,
bias_type,
0,
BiasMode::NO_BIAS,


+ 1
- 2
dnn/src/fallback/convolution/opr_impl.cpp View File

@@ -225,8 +225,7 @@ ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param(
param().format == Param::Format::NCHW44_DOT ||
param().format == Param::Format::NCHW44) {
spatial_pos = 2;
} else if (param().format == Param::Format::NCHW ||
param().format == Param::Format::NCHW_WINOGRAD) {
} else if (param().format == Param::Format::NCHW) {
spatial_pos = 2;
} else if (param().format == Param::Format::NHWC) {
spatial_pos = 1;


+ 0
- 1
dnn/src/naive/handle.cpp View File

@@ -78,7 +78,6 @@
#include "src/naive/type_cvt/opr_impl.h"
#include "src/naive/warp_affine/opr_impl.h"
#include "src/naive/warp_perspective/opr_impl.h"
#include "src/naive/winograd_filter_preprocess/opr_impl.h"
#include "src/naive/remap/opr_impl.h"
#include "src/naive/fake_quant/opr_impl.h"



+ 0
- 234
dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp View File

@@ -1,234 +0,0 @@
/**
* \file dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/naive/winograd_filter_preprocess/opr_impl.h"
#include "src/common/utils.h"
#include "src/common/winograd/winograd_helper.h"
#include "src/naive/handle.h"

#include "midout.h"
MIDOUT_DECL(megdnn_naive_winograd_filter_preprocess)

using namespace megdnn;
using namespace naive;

/**
 * Naive (reference) implementation of Winograd filter preprocessing.
 *
 * Transforms the convolution filter tensor `src` into the Winograd domain
 * and writes the transformed filter to `dst`.  Dispatch happens on:
 *   - filter layout rank (dense vs. group conv, NCHW88/NCHW44 packing),
 *   - source dtype (f32, quantized int8, f16 when compiled in),
 *   - filter width FW and output block size m (param().output_block_size).
 * The arithmetic itself is delegated to
 * winograd::StrategyHelper<...>::filter(); this function only selects the
 * strategy and iterates over convolution groups.
 */
void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
_megdnn_workspace workspace) {
check_exec(src.layout, dst.layout, workspace.size);
//! nchw88 group conv
// Decode the filter layout from its rank (see the branches below):
//   ndim == 4: dense NCHW filter -> the defaults below apply
//   ndim == 5: group conv, leading dim is the group count
//   ndim == 6: dense packed filter, trailing dim is the channel pack size
//   ndim == 7: group conv with packed channels (group dim + pack dim)
size_t flt_start = 0;
size_t pack_c_size = 1;
size_t group = 1;
//! group conv
if (src.layout.ndim == 5) {
flt_start = 1;
group = src.layout[0];
//! nchw88 dense conv
} else if (src.layout.ndim == 6) {
pack_c_size = src.layout[5];
//! nchw88 group conv
} else if (src.layout.ndim == 7) {
flt_start = 1;
group = src.layout[0];
pack_c_size = src.layout[6];
}
// Unpacked per-group channel counts and the spatial filter width.
size_t OC = src.layout[flt_start] * pack_c_size,
IC = src.layout[flt_start + 1] * pack_c_size,
FW = src.layout[flt_start + 3];

// Output tile size m of the Winograd algorithm F(m, FW).
size_t m = param().output_block_size;

// Set by DISPATCH_KERNEL once a matching kernel actually ran; checked at
// the end so unsupported configurations fail loudly.
bool execed = false;

// cb: if param().format matches `_format`, run the NCHW strategy helper on
// the current group's filters.  It is expanded inside the `run` lambda of
// DISPATCH_KERNEL, hence the `return` only exits the lambda.
// `interp_points` is the local vector in scope at the DISPATCH_DTYPE call
// site.
#define cb(_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, _format, rescale) \
if (param().format == _format) { \
return winograd::StrategyHelper< \
_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, param::ConvBias::Format::NCHW, \
_format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \
OC, m, FW, interp_points, src.layout.dtype, \
rescale); \
}

// Try DEFAULT then the MK4 block-packed matmul format.
#define DISPATCH_FORMAT_MK4(_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, _rescale) \
cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \
param::Winograd::Format::DEFAULT, _rescale); \
cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \
param::Winograd::Format::MK4, _rescale);

// Try DEFAULT then the MK8 block-packed matmul format.
#define DISPATCH_FORMAT_MK8(_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, _rescale) \
cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \
param::Winograd::Format::DEFAULT, _rescale); \
cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \
param::Winograd::Format::MK8, _rescale);

// Dispatch one kernel per convolution group on the CPU dispatcher; the
// capture-by-value lambda snapshots src_ptr/dst_ptr before they are
// advanced by the group stride.  Marks `execed` on success.
#define DISPATCH_KERNEL(_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, _kern, _rescale, ...) \
const _ctype* src_ptr = src.compatible_ptr<_ctype>(); \
_input_filter_compute_type* dst_ptr = \
dst.compatible_ptr<_input_filter_compute_type>(); \
_input_filter_compute_type* workspace_ptr = \
workspace.ptr<_input_filter_compute_type>(); \
MIDOUT_BEGIN(megdnn_naive_winograd_filter_preprocess, ##__VA_ARGS__) { \
for (size_t g = 0; g < group; g++) { \
auto run = [=]() { \
_kern(_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, _rescale); \
}; \
MEGDNN_DISPATCH_CPU_KERN_OPR(run()); \
src_ptr += src.layout.stride[0]; \
dst_ptr += dst.layout.stride[0]; \
} \
execed = true; \
} \
MIDOUT_END();

// Per-dtype dispatch for the unpacked NCHW path.  Note the int8 path
// computes the transform in int16/int32 and rescales by 2.0f.
#define DISPATCH_DTYPE(_midout_tag) \
if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \
DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \
DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \
} \
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { \
DISPATCH_KERNEL(dt_int8, dt_int8, dt_int16, dt_int32, \
DISPATCH_FORMAT_MK8, 2.0f, _midout_tag, 1); \
} \
MEGDNN_INC_FLOAT16(if (src.layout.dtype.enumv() == DTypeEnum::Float16) { \
DISPATCH_KERNEL(dt_float16, dt_float16, dt_float16, dt_float16, \
DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 2); \
})

// ---- Unpacked NCHW path (dense or plain group conv) ----
// Each supported (FW, m) pair has a fixed set of Winograd interpolation
// points used to build the transform matrices.
if (src.layout.ndim <= 5) {
//! dispatch_dtype with consider layout and format.
if (FW == 3) {
if (m == 2) {
std::vector<float> interp_points = {0, 1, -1};
DISPATCH_DTYPE(0);
} else if (m == 6) {
std::vector<float> interp_points = {0, 1, -1, 2, -2, 0.5, -0.5};
DISPATCH_DTYPE(1);
}
} else if (FW == 4) {
if (m == 5) {
std::vector<float> interp_points = {0, 0.5, -0.5, 1, -1, 2, -2};
DISPATCH_DTYPE(2);
}
} else if (FW == 5) {
if (m == 4) {
std::vector<float> interp_points = {0, 1, -1, 0.5, -0.5, 2, -2};
DISPATCH_DTYPE(3);
}
}
#undef cb
#undef DISPATCH_FORMAT_MK4
#undef DISPATCH_FORMAT_MK8
#undef DISPATCH_DTYPE
} else {
// ---- Packed-channel path (NCHW88 / NCHW44 filters) ----
megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7);
// Same shape as the cb above, but instantiated for the NCHW88 layout.
#define cb(_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, _format, rescale) \
if (param().format == _format) { \
return winograd::StrategyHelper< \
_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, param::ConvBias::Format::NCHW88, \
_format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \
OC, m, FW, interp_points, src.layout.dtype, \
rescale); \
}

#define DISPATCH_FORMAT_MK8(_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, _rescale) \
cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \
param::Winograd::Format::MK8, _rescale);

#define DISPATCH_DTYPE(_midout_tag) \
if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \
DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \
DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 0); \
}
if (pack_c_size == 8) { //! NCHW88
if (FW == 3) {
if (m == 2) {
std::vector<float> interp_points = {0, 1, -1};
DISPATCH_DTYPE(4);
} else if (m == 6) {
std::vector<float> interp_points = {0, 1, -1, 2,
-2, 0.5, -0.5};
DISPATCH_DTYPE(5);
}
}
#undef cb
#undef DISPATCH_DTYPE
}
else if (pack_c_size == 4) { //! NCHW44
// NCHW44 variant of cb; int8 may be computed either in f32 (MK4)
// or in int16/int32 (MK8) depending on param().format.
#define cb(_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, _format, rescale) \
if (param().format == _format) { \
return winograd::StrategyHelper< \
_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, param::ConvBias::Format::NCHW44, \
_format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \
OC, m, FW, interp_points, src.layout.dtype, \
rescale); \
}

#define DISPATCH_FORMAT_MK4(_ctype, _dst_type, _input_filter_compute_type, \
_output_compute_type, _rescale) \
cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \
param::Winograd::Format::MK4, _rescale);

#define DISPATCH_DTYPE(_midout_tag) \
if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \
DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \
DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \
} \
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { \
if (param().format == param::Winograd::Format::MK4) { \
DISPATCH_KERNEL(dt_int8, dt_int8, dt_float32, dt_float32, \
DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \
} else if (param().format == param::Winograd::Format::MK8) { \
DISPATCH_KERNEL(dt_int8, dt_int8, dt_int16, dt_int32, \
DISPATCH_FORMAT_MK8, 2.0f, _midout_tag, 0); \
} \
}
if (FW == 3) {
if (m == 2) {
std::vector<float> interp_points = {0, 1, -1};
DISPATCH_DTYPE(6);
} else if (m == 6) {
std::vector<float> interp_points = {0, 1, -1, 2,
-2, 0.5, -0.5};
DISPATCH_DTYPE(7);
} else if (m == 7) {
std::vector<float> interp_points = {0, 1, -1, 2,
-2, 0.5, -0.5, 1.5};
DISPATCH_DTYPE(8);
}
}
#undef cb
#undef DISPATCH_FORMAT_MK8
#undef DISPATCH_FORMAT_MK4
#undef DISPATCH_KERNEL
#undef DISPATCH_DTYPE
}
}

// If no (dtype, format, m, FW) combination matched, reject the request
// instead of returning uninitialized output.
megdnn_assert(execed,
"Unsupport winograd filter preprocess. m: %zu src: %s", m,
src.layout.to_string().c_str());
}

// vim: syntax=cpp.doxygen

+ 0
- 28
dnn/src/naive/winograd_filter_preprocess/opr_impl.h View File

@@ -1,28 +0,0 @@
/**
* \file dnn/src/naive/winograd_filter_preprocess/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/common/utils.h"

namespace megdnn {
namespace naive {

// Naive (reference) backend operator that rewrites convolution filters into
// the Winograd transform domain.  Construction is inherited unchanged from
// the base WinogradFilterPreprocess operator; only exec() is implemented
// here (layout/dtype dispatch lives in opr_impl.cpp).
class WinogradFilterPreprocessImpl : public WinogradFilterPreprocess {
public:
using WinogradFilterPreprocess::WinogradFilterPreprocess;
// Transform filter `src` into `dst`, using `workspace` as scratch memory.
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
};

} // namespace naive
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 2
- 12
dnn/src/x86/conv_bias/f32/winograd_algo.cpp View File

@@ -43,12 +43,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_8x8::usable(
strategy, m_tile_size, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW88 ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW88_WINOGRAD &&
param.output_block_size == 6 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK8)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW88 &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&
@@ -89,12 +84,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_8x8::usable(
strategy, m_tile_size, param)
.get_matmul_kern_param(param);
return m_matmul_algo->usable(matmul_param) &&
(param.filter_meta.format == param::ConvBias::Format::NCHW88 ||
(param.filter_meta.format ==
param::ConvBias::Format::NCHW88_WINOGRAD &&
param.output_block_size == 2 &&
param.winograd_matmul_format ==
param::MatrixMul::Format::MK8)) &&
param.filter_meta.format == param::ConvBias::Format::NCHW88 &&
!param.filter_meta.should_flip &&
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
param.filter_meta.spatial[0] == 3) &&


+ 1
- 5
dnn/src/x86/conv_bias/opr_impl.cpp View File

@@ -173,11 +173,7 @@ SmallVector<AlgoCategory> ConvBiasImpl::suggest_algo_category_order(
auto FH = param.filter_meta.spatial[0];
auto FW = param.filter_meta.spatial[1];
//! TODO: now winograd only support fast-run
if (param.filter_meta.format == param::ConvBias::Format::NCHW_WINOGRAD ||
param.filter_meta.format == param::ConvBias::Format::NCHW44_WINOGRAD ||
param.filter_meta.format == param::ConvBias::Format::NCHW88_WINOGRAD) {
return {AlgoCategory::WINOGRAD};
}

//! nchw88 use mkl-dnn which algo is direct
if (param.filter_meta.format == param::ConvBias::Format::NCHW88) {
return {AlgoCategory::DIRECT, AlgoCategory::IM2COL};


+ 130
- 244
dnn/test/arm_common/conv_bias_multi_thread.cpp View File

@@ -629,6 +629,35 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_INT8_DIRECT_DOT_NCHW44_S2_8x8x32) {

#endif

// Checks the winograd conv-bias algorithms on 3x3 filters for f32 (and f16
// when FP16 vector arithmetic is available), comparing against the checker's
// reference implementation with a per-dtype tolerance.
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_args(3);

    Checker<ConvBiasForward> checker(handle());

    // Run every test case with one fixed dtype configuration.
    auto check_all = [&checker](const std::vector<TestArg>& test_args,
                                DType src_dtype, DType filter_dtype,
                                DType bias_dtype, DType dst_dtype,
                                const float max_err) {
        for (const auto& test_arg : test_args) {
            checker.set_param(test_arg.param)
                    .set_dtype(0, src_dtype)
                    .set_dtype(1, filter_dtype)
                    .set_dtype(2, bias_dtype)
                    .set_dtype(4, dst_dtype)
                    .set_epsilon(max_err)
                    .execs({test_arg.src, test_arg.filter, test_arg.bias,
                            {},
                            {}});
        }
    };
    check_all(args, dtype::Float32(), dtype::Float32(), dtype::Float32(),
              dtype::Float32(), 1e-3f);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00);
    checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng);
    check_all(args, dtype::Float16(), dtype::Float16(), dtype::Float16(),
              dtype::Float16(), 0.35f);
#endif
}

TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4) {
using namespace conv_bias;
std::vector<TestArg> args = get_winograd_mk_packed_args();
@@ -717,207 +746,97 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F45) {
check_winograd("1:4:32", checker, args);
}



// Legacy version of the CONV_BIAS_WINOGRAD test: it emulates the two-step
// winograd pipeline by hand (filter preprocess operator + NCHW_WINOGRAD
// conv-bias operator) and installs it as the checker's extra reference
// implementation.
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) {
using namespace conv_bias;
std::vector<TestArg> args = get_winograd_args(3);

Checker<ConvBiasForward> checker(handle());

// Reference implementation: transform the filter (tensors[1]) into the
// winograd domain, then run the NCHW_WINOGRAD conv-bias on the result.
auto extra_impl = [](const TensorNDArray& tensors, uint32_t m,
param::ConvBias param, Handle* handle) {
megdnn_assert(param.format == param::ConvBias::Format::NCHW);
auto winograd_preprocess_opr =
handle->create_operator<WinogradFilterPreprocess>();
winograd_preprocess_opr->param().output_block_size = m;
TensorLayout filter_transform_layout;
winograd_preprocess_opr->deduce_layout(tensors[1].layout,
filter_transform_layout);
size_t winograd_preprocess_workspace_in_bytes =
winograd_preprocess_opr->get_workspace_in_bytes(
tensors[1].layout, filter_transform_layout);

auto conv_bias_opr = handle->create_operator<ConvBias>();
conv_bias_opr->param() = param;
conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD;
conv_bias_opr->param().output_block_size = m;
size_t conv_bias_workspace_in_bytes =
conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout,
tensors[2].layout, tensors[3].layout, tensors[4].layout,
nullptr);

// One malloc'ed bundle holds: 0) the transformed filter, 1) conv-bias
// workspace, 2) preprocess workspace.  Freed below after both execs.
WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
winograd_preprocess_workspace_in_bytes});
wb.set(malloc(wb.total_size_in_bytes()));

TensorND filter_transform_tensor(wb.get(0),
std::move(filter_transform_layout));
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
wb.get_workspace(2));
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
tensors[3], tensors[4], nullptr,
wb.get_workspace(1));

free(wb.ptr());
};

// Exercise every arg with each requested output block size m, comparing
// the tested algo against extra_impl above.
auto run = [&checker, &extra_impl](
Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
const float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(extra_impl,
std::placeholders::_1, m,
arg.param, handle));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
}
};
run(handle(), args, {6}, dtype::Float32(), dtype::Float32(),
dtype::Float32(), dtype::Float32(), 1e-3f);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00);
checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng);
run(handle(), args, {6}, dtype::Float16(), dtype::Float16(),
dtype::Float16(), dtype::Float16(), 0.35f);
#endif
}



TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_1) {
using namespace conv_bias;

Checker<ConvBiasForward> checker(handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};
std::vector<TestArg> args = get_winograd_mk_packed_args(8);
std::vector<TestArg> args_first_half(args.begin(),
args.begin() + args.size() / 2);
run(handle(), args_first_half, {2, 6}, dtype::Float32{}, dtype::Float32{},
dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4,
1e-3f);
run(args_first_half, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
dtype::Float32{}, 1e-3f);
}



TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_2) {
using namespace conv_bias;

Checker<ConvBiasForward> checker(handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};
std::vector<TestArg> args = get_winograd_mk_packed_args(8);
std::vector<TestArg> args_second_half(args.begin() + args.size() / 2,
args.end());
run(handle(), args_second_half, {2, 6}, dtype::Float32{}, dtype::Float32{},
dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4,
1e-3f);
run(args_second_half, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
dtype::Float32{}, 1e-3f);
}



#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F16) {
using namespace conv_bias;

Checker<ConvBiasForward> checker(handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

std::vector<TestArg> args = get_winograd_mk_packed_args(8);
Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00);
checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng);
run(handle(), args, {2}, dtype::Float16{}, dtype::Float16{},
dtype::Float16{}, dtype::Float16{}, param::MatrixMul::Format::MK8,
0.25);
run(args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
dtype::Float16{}, 0.25);
}


#endif
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) {
using namespace conv_bias;

Checker<ConvBiasForward> checker(handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

@@ -933,24 +852,19 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) {
get_quantized_winograd_mk_packed_args(8);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f),
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3);
run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3);
}

TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8) {
using namespace conv_bias;

Checker<ConvBiasForward> checker(handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args,
DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
@@ -958,7 +872,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8) {
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
}
};

@@ -973,118 +886,99 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8) {
std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f),
run(quantized_args, dtype::QuantizedS8(2.5f),
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3);
dtype::QuantizedS8(60.25f),1e-3);
}

TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE) {
CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32_GROUPMODE) {
using namespace conv_bias;

Checker<ConvBiasForward> checker(handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

float epsilon = 0.001;
#if MEGDNN_AARCH64
const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8";
const char* matmul_name = "AARCH64_F32_MK4_4x16";
#else
const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8";
const char* matmul_name = "ARMV7_F32_MK4_4x8";
#endif
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str()));

ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str()));
std::vector<TestArg> quantized_args =
get_int8_nchw44_args(3, 4, false, true);
get_int8_nchw44_args(3, 4, true, true);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f),
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3);
run(quantized_args, dtype::QuantizedS8(0.41113496f),
dtype::QuantizedS8(0.01887994f),
dtype::QuantizedS32(0.41113496f * 0.01887994f),
dtype::QuantizedS8(0.49550694f), epsilon);
}

TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32) {
CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE) {
using namespace conv_bias;

Checker<ConvBiasForward> checker(handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

float epsilon = 0.001;
#if MEGDNN_AARCH64
const char* matmul_name = "AARCH64_F32_MK4_4x16";
const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8";
#else
const char* matmul_name = "ARMV7_F32_MK4_4x8";
const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8";
#endif
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str()));
std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4, true);
ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str()));

std::vector<TestArg> quantized_args =
get_int8_nchw44_args(3, 4, false, true);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f),
dtype::QuantizedS8(0.01887994f),
dtype::QuantizedS32(0.41113496f * 0.01887994f),
dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4,
epsilon);
run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3);
}

TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32_GROUPMODE) {
CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32) {
using namespace conv_bias;

Checker<ConvBiasForward> checker(handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

@@ -1096,23 +990,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
#endif
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str()));
std::vector<TestArg> quantized_args =
get_int8_nchw44_args(3, 4, true, true);
std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4, true);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f),
run(quantized_args, dtype::QuantizedS8(0.41113496f),
dtype::QuantizedS8(0.01887994f),
dtype::QuantizedS32(0.41113496f * 0.01887994f),
dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4,
epsilon);
dtype::QuantizedS8(0.49550694f), epsilon);
}







#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23) {
using namespace conv_bias;
@@ -1170,7 +1056,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_8x8_2) {
check_winograd_fp16("8:2:32", checker, args_back_half, rng, 0.25,
param::MatrixMul::Format::MK8);
}

#endif
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_INT8_8X8) {
using namespace conv_bias;
@@ -1187,6 +1072,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_INT8_8X8) {

check_winograd("8:2:32", checker, args, param::MatrixMul::Format::MK8);
}

TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_WINOGRAD_INT8_8X8_WEIGHT_PREPROCESS) {
using namespace conv_bias;


+ 86
- 186
dnn/test/arm_common/conv_bias_multi_thread_weight_preprocess.cpp View File

@@ -83,56 +83,12 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_PREPROCESS_NCHW44) {

Checker<ConvBiasForward> checker(handle());

auto extra_impl = [](const TensorNDArray& tensors, uint32_t m,
param::ConvBias param, Handle* handle) {
megdnn_assert(param.format == param::ConvBias::Format::NCHW44);
auto winograd_preprocess_opr =
handle->create_operator<WinogradFilterPreprocess>();
winograd_preprocess_opr->param().output_block_size = m;
winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK4;
TensorLayout filter_transform_layout;
winograd_preprocess_opr->deduce_layout(tensors[1].layout,
filter_transform_layout);
size_t winograd_preprocess_workspace_in_bytes =
winograd_preprocess_opr->get_workspace_in_bytes(
tensors[1].layout, filter_transform_layout);

auto conv_bias_opr = handle->create_operator<ConvBias>();
conv_bias_opr->param() = param;
conv_bias_opr->param().format =
param::ConvBias::Format::NCHW44_WINOGRAD;
conv_bias_opr->param().output_block_size = m;
size_t conv_bias_workspace_in_bytes =
conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout,
tensors[2].layout, tensors[3].layout, tensors[4].layout,
nullptr);

WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
winograd_preprocess_workspace_in_bytes});
wb.set(malloc(wb.total_size_in_bytes()));

TensorND filter_transform_tensor(wb.get(0),
std::move(filter_transform_layout));
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
wb.get_workspace(2));
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
tensors[3], tensors[4], nullptr,
wb.get_workspace(1));
free(wb.ptr());
};

auto run = [&checker, &extra_impl](
Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](
const std::vector<TestArg>& args,
DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
const float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(extra_impl,
std::placeholders::_1, m,
arg.param, handle));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
@@ -140,7 +96,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_PREPROCESS_NCHW44) {
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
}
};

@@ -149,7 +104,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_PREPROCESS_NCHW44) {
// dtype::Float32(), dtype::Float32(), 1e-2f);

//! remove this when low precision mode is ok
run(handle(), nchw44_args, {2, 6}, dtype::Float32(), dtype::Float32(),
run(nchw44_args, dtype::Float32(), dtype::Float32(),
dtype::Float32(), dtype::Float32(), 1e-3f);
}
TEST_F(ARM_COMMON_MULTI_THREADS,
@@ -158,31 +113,24 @@ TEST_F(ARM_COMMON_MULTI_THREADS,

Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};
std::vector<TestArg> args = get_winograd_mk_packed_args(8);
std::vector<TestArg> args_first_half(args.begin(),
args.begin() + args.size() / 2);
run(handle(), args_first_half, {2, 6}, dtype::Float32{}, dtype::Float32{},
dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4,
1e-3f);
run(args_first_half, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
dtype::Float32{}, 1e-3f);
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_WINOGRAD_MK_PACKED_F32_2_WEIGHT_PREPROCESS) {
@@ -190,31 +138,24 @@ TEST_F(ARM_COMMON_MULTI_THREADS,

Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};
std::vector<TestArg> args = get_winograd_mk_packed_args(8);
std::vector<TestArg> args_second_half(args.begin() + args.size() / 2,
args.end());
run(handle(), args_second_half, {2, 6}, dtype::Float32{}, dtype::Float32{},
dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4,
1e-3f);
run(args_second_half, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
dtype::Float32{}, 1e-3f);
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON_MULTI_THREADS,
@@ -223,32 +164,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS,

Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

std::vector<TestArg> args = get_winograd_mk_packed_args(8);
Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00);
checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng);
run(handle(), args, {2}, dtype::Float16{}, dtype::Float16{},
dtype::Float16{}, dtype::Float16{}, param::MatrixMul::Format::MK8,
0.25);
run(args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
dtype::Float16{}, 0.25);
}
#endif
TEST_F(ARM_COMMON_MULTI_THREADS,
@@ -257,23 +191,17 @@ TEST_F(ARM_COMMON_MULTI_THREADS,

Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

@@ -289,9 +217,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
get_quantized_winograd_mk_packed_args(8);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f),
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3);
run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3);
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_WEIGHT_PREPROCESS) {
@@ -299,15 +226,11 @@ TEST_F(ARM_COMMON_MULTI_THREADS,

Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args,
DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
@@ -315,7 +238,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
}
};

@@ -330,9 +252,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f),
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3);
run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3);
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE_WEIGHT_PREPROCESS) {
@@ -340,23 +261,17 @@ TEST_F(ARM_COMMON_MULTI_THREADS,

Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

@@ -372,9 +287,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
get_int8_nchw44_args(3, 4, false, true);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f),
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3);
run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3);
}

TEST_F(ARM_COMMON_MULTI_THREADS,
@@ -383,23 +297,17 @@ TEST_F(ARM_COMMON_MULTI_THREADS,

Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

@@ -414,11 +322,10 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4, true);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f),
run(quantized_args, dtype::QuantizedS8(0.41113496f),
dtype::QuantizedS8(0.01887994f),
dtype::QuantizedS32(0.41113496f * 0.01887994f),
dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4,
epsilon);
dtype::QuantizedS8(0.49550694f), epsilon);
}

TEST_F(ARM_COMMON_MULTI_THREADS,
@@ -427,23 +334,17 @@ TEST_F(ARM_COMMON_MULTI_THREADS,

Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
param::MatrixMul::Format format, float eps) {
float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(
winograd_algo_extra_impl, std::placeholders::_1, m,
arg.param, handle, format));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};

@@ -459,11 +360,10 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
get_int8_nchw44_args(3, 4, true, true);
UniformIntRNG int_rng{-50, 50};
checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f),
run(quantized_args, dtype::QuantizedS8(0.41113496f),
dtype::QuantizedS8(0.01887994f),
dtype::QuantizedS32(0.41113496f * 0.01887994f),
dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4,
epsilon);
dtype::QuantizedS8(0.49550694f), epsilon);
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23_WEIGHT_PREPROCESS) {


+ 0
- 91
dnn/test/arm_common/winograd_filter_preprocess.cpp View File

@@ -1,91 +0,0 @@
/**
* \file dnn/test/arm_common/winograd_filter_preprocess.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "test/common/checker.h"
#include "test/common/benchmarker.h"
#include "test/common/winograd_filter_preprocess.h"

#include "test/arm_common/fixture.h"

using namespace megdnn;
using namespace test;

TEST_F(ARM_COMMON, WinogradFilterPreprocessF32) {
    using namespace winograd_filter_preprocess;
    Checker<WinogradFilterPreprocess> checker(handle());

    // Gather every float32 case: default-format transforms (F(6,3), F(5,4),
    // F(4,5)) followed by the MK4-packed ones (output block 2 and 6).
    std::vector<TestArg> all_args = get_args(6, 3);
    for (auto&& extra : {get_args(5, 4), get_args(4, 5),
                         get_mk_packed_args(2, param::Winograd::Format::MK4, 4),
                         get_mk_packed_args(6, param::Winograd::Format::MK4, 4)}) {
        all_args.insert(all_args.end(), extra.begin(), extra.end());
    }

    // Both src filter and transformed output are float32; dst layout is
    // deduced by the operator (empty shape in execs).
    for (auto&& arg : all_args) {
        checker.set_param(arg.param)
                .set_dtype(0, dtype::Float32{})
                .set_dtype(1, dtype::Float32{})
                .execs({arg.src, {}});
    }
}

TEST_F(ARM_COMMON, WinogradFilterPreprocessQs8) {
    using namespace winograd_filter_preprocess;
    Checker<WinogradFilterPreprocess> checker(handle());
    // Quantized int8 path: MK8-packed layout, output block size 2,
    // pack size 8. Inputs are drawn from a small uniform integer range.
    UniformIntRNG rng{-50, 50};
    checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng);
    for (auto&& arg :
         get_mk_packed_args(2, param::Winograd::Format::MK8, 8)) {
        // QuantizedS8 filter in, QuantizedS16 transformed filter out.
        checker.set_param(arg.param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS16(2.5f))
                .execs({arg.src, {}});
    }
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, WinogradFilterPreprocessF16) {
    using namespace winograd_filter_preprocess;
    Checker<WinogradFilterPreprocess> checker(handle());
    // default-format transforms: F(6,3), DEFAULT-format packed F(2,3), F(4,5)
    std::vector<TestArg> args = get_args(6, 3);
    std::vector<TestArg> args_23 =
            get_mk_packed_args(2, param::Winograd::Format::DEFAULT, 4);
    std::vector<TestArg> args45 = get_args(4, 5);

    // mk8-packed F(2,3), pack size 8
    std::vector<TestArg> args_mk8_out2 =
            get_mk_packed_args(2, param::Winograd::Format::MK8, 8);

    args.insert(args.end(), args_23.begin(), args_23.end());
    args.insert(args.end(), args45.begin(), args45.end());
    args.insert(args.end(), args_mk8_out2.begin(), args_mk8_out2.end());

    // Stack-allocated RNG (the original `new Float16PeriodicalRNG` leaked);
    // matches the stack-RNG style of the other tests in this file. The
    // checker only borrows the pointer, and `rng` outlives every exec call.
    Float16PeriodicalRNG rng(0x3c00);
    for (auto&& arg : args) {
        checker.set_param(arg.param)
                .set_rng(0, &rng)
                .set_dtype(0, dtype::Float16{})
                .set_dtype(1, dtype::Float16{})
                .execs({arg.src, {}});
    }
}

#endif

// vim: syntax=cpp.doxygen

+ 0
- 45
dnn/test/common/conv_bias.cpp View File

@@ -1152,50 +1152,6 @@ void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args,
}


//! Reference implementation used by the checker for winograd conv-bias:
//! runs the filter transform explicitly via WinogradFilterPreprocess, then
//! executes ConvBias in the corresponding *_WINOGRAD format, so the result
//! can be compared against the fused algorithm under test.
//! \param tensors checker tensor array: [src, filter, bias, z, dst]
//! \param m       winograd output block size
//! \param param   conv-bias param of the case (format must be NCHW/NCHW44)
//! \param format  matmul packing format used by the transform (e.g. MK4/MK8)
void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
                              param::ConvBias param, Handle* handle,
                              param::MatrixMul::Format format) {
    megdnn_assert(param.format == param::ConvBias::Format::NCHW ||
                  param.format == param::ConvBias::Format::NCHW44);
    // Step 1: deduce the transformed-filter layout and the workspace the
    // preprocess operator needs.
    auto winograd_preprocess_opr =
            handle->create_operator<WinogradFilterPreprocess>();
    winograd_preprocess_opr->param().output_block_size = m;
    winograd_preprocess_opr->param().format = format;
    winograd_preprocess_opr->param().compute_mode = param.compute_mode;
    TensorLayout filter_transform_layout;
    winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                           filter_transform_layout);
    size_t winograd_preprocess_workspace_in_bytes =
            winograd_preprocess_opr->get_workspace_in_bytes(
                    tensors[1].layout, filter_transform_layout);

    // Step 2: build the conv-bias operator that consumes the pre-transformed
    // filter; its format switches to the matching *_WINOGRAD variant.
    auto conv_bias_opr = handle->create_operator<ConvBias>();
    conv_bias_opr->param() = param;
    if (param.format == param::ConvBias::Format::NCHW) {
        conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD;
    } else {
        conv_bias_opr->param().format =
                param::ConvBias::Format::NCHW44_WINOGRAD;
    }
    conv_bias_opr->param().output_block_size = m;
    size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes(
            tensors[0].layout, filter_transform_layout, tensors[2].layout,
            tensors[3].layout, tensors[4].layout, nullptr);

    // Single bundle backing all temporaries:
    //   slot 0: transformed filter storage
    //   slot 1: conv-bias workspace
    //   slot 2: preprocess workspace
    WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
                                 conv_bias_workspace_in_bytes,
                                 winograd_preprocess_workspace_in_bytes});
    wb.set(malloc(wb.total_size_in_bytes()));

    // Step 3: transform the filter, then run the winograd conv-bias.
    TensorND filter_transform_tensor(wb.get(0),
                                     std::move(filter_transform_layout));
    winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
                                  wb.get_workspace(2));
    conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
                        tensors[3], tensors[4], nullptr, wb.get_workspace(1));
    free(wb.ptr());
};

void checker_conv_bias_common(std::vector<conv_bias::TestArg> args, Handle* handle,
RNG* rng, float epsilon, DType type0, DType type1,
DType type2, DType type3, const char* algo_name) {
@@ -1388,7 +1344,6 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
}
return args;
}

} // namespace conv_bias
} // namespace test
} // namespace megdnn


+ 0
- 3
dnn/test/common/conv_bias.h View File

@@ -94,9 +94,6 @@ void checker_conv_bias_int8x8x16(
std::vector<megdnn::test::conv_bias::TestArg> args,
megdnn::Handle* handle, const char* algo_name);

void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
param::ConvBias param, Handle* handle,
param::MatrixMul::Format format);
void checker_conv_bias_common(std::vector<conv_bias::TestArg> args,
Handle* handle, RNG* rng, float epsilon,
DType type0, DType type1, DType type2,


+ 0
- 1
dnn/test/common/opr_trait.h View File

@@ -95,7 +95,6 @@ DEF(MaskConvolution, 4, true, true);
DEF(MaskPropagate, 2, true, true);
DEF(RelayoutFormat, 2, true, true);
DEF(MaxTensorDiff, 2, true, false);
DEF(WinogradFilterPreprocess, 2, true, true);
DEF(LocalShareForward, 3, true, true);
DEF(LocalShareBackwardData, 3, true, false);
DEF(LocalShareBackwardFilter, 3, true, false);


+ 12
- 59
dnn/test/x86/conv_bias.cpp View File

@@ -1814,69 +1814,22 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
using namespace conv_bias;
std::vector<TestArg> args = get_winograd_mk_nchw88_args();
Checker<ConvBiasForward> checker(handle());
auto extra_impl = [](const TensorNDArray& tensors, uint32_t m,
param::ConvBias param, Handle* handle) {
megdnn_assert(param.format == param::ConvBias::Format::NCHW88);
auto winograd_preprocess_opr =
handle->create_operator<WinogradFilterPreprocess>();
winograd_preprocess_opr->param().output_block_size = m;
winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK8;
TensorLayout filter_transform_layout;
winograd_preprocess_opr->deduce_layout(tensors[1].layout,
filter_transform_layout);
size_t winograd_preprocess_workspace_in_bytes =
winograd_preprocess_opr->get_workspace_in_bytes(
tensors[1].layout, filter_transform_layout);

auto conv_bias_opr = handle->create_operator<ConvBias>();
conv_bias_opr->param() = param;
conv_bias_opr->param().format =
param::ConvBias::Format::NCHW88_WINOGRAD;
conv_bias_opr->param().output_block_size = m;
size_t conv_bias_workspace_in_bytes =
conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout,
tensors[2].layout, tensors[3].layout, tensors[4].layout,
nullptr);

WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
winograd_preprocess_workspace_in_bytes});
wb.set(malloc(wb.total_size_in_bytes()));

TensorND filter_transform_tensor(wb.get(0),
std::move(filter_transform_layout));
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
wb.get_workspace(2));
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
tensors[3], tensors[4], nullptr,
wb.get_workspace(1));

free(wb.ptr());
};

auto run = [&checker, &extra_impl](
Handle* handle, const std::vector<TestArg>& args,
const std::vector<size_t>& out_size, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
const float eps) {
auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype,
DType B_dtype, DType C_dtype, DType D_dtype,
const float eps) {
for (auto&& arg : args) {
for (uint32_t m : out_size) {
checker.set_extra_opr_impl(std::bind(extra_impl,
std::placeholders::_1, m,
arg.param, handle));
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
checker.set_dtype(0, A_dtype)
.set_dtype(1, B_dtype)
.set_dtype(2, C_dtype)
.set_dtype(4, D_dtype)
.set_epsilon(eps)
.set_param(arg.param)
.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
};
run(handle(), args, {2, 6}, dtype::Float32(), dtype::Float32(),
dtype::Float32(), dtype::Float32(), 1e-3f);
run(args, dtype::Float32(), dtype::Float32(), dtype::Float32(),
dtype::Float32(), 1e-3f);
}

/*********************************** End winograd ************************/


+ 0
- 1
src/core/impl/graph/cg_impl.cpp View File

@@ -32,7 +32,6 @@
#include "megbrain/jit/fusion_pass.h"
#endif

#include "megbrain/gopt/weights_preprocess.h"

using namespace mgb;
using namespace cg;


+ 0
- 3
src/gopt/impl/framework.cpp View File

@@ -14,7 +14,6 @@
#include "megbrain/gopt/gtrans.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/gopt/misc.h"
#include "megbrain/gopt/weights_preprocess.h"
#include "megbrain/graph/cg.h"
#include "megbrain/graph/event.h"
#include "megbrain/graph/exc_extra_info.h"
@@ -780,8 +779,6 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass<FuseConvBiasZPass>();
});

cb(weight_winograd_transform,
{ add_pass<WinogradTransformReplacePass>(); });
#undef cb

if (need_param_fuse) {


+ 0
- 206
src/gopt/impl/weights_preprocess.cpp View File

@@ -1,206 +0,0 @@
/**
* \file src/gopt/impl/weights_preprocess.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "megbrain/gopt/weights_preprocess.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/tensor_manip.h"

#include "megbrain/utils/hash_ct.h"
#include "midout.h"

MIDOUT_DECL(megbrain_weight_preprocess)
#define MIDOUT_B(tag) \
MIDOUT_BEGIN(megbrain_weight_preprocess, midout_iv(MGB_HASH_STR(tag))) {
#define MIDOUT_E \
} \
MIDOUT_END();

using namespace mgb;
using namespace gopt;
using namespace cg;

//! Pass name reported to the graph optimizer framework (used in logs
//! and for pass identification).
const char* WinogradTransformReplacePass::name() const {
    return "winograd_transform";
}

//! Rewrite each ConvBias whose profiled best algorithm is a winograd algo:
//! insert a WinogradFilterPreprocess on the (constant) filter and switch the
//! ConvBias format to the matching *_WINOGRAD variant, so the filter
//! transform is folded out of the per-inference execution path.
//! Fix vs previous revision: corrected typo "suppport" in the assert message.
void WinogradTransformReplacePass::apply(OptState& opt) const {
    MIDOUT_B("WinogradTransformReplacePass::apply")
    auto rewriter = opt.graph().make_rewriter();
    // Track which vars are constant (immutable or param): only constant
    // filters may be pre-transformed, since the transformed filter must be
    // foldable into a persistent value.
    ConstVarPropogate cvprop{ConstVarType::IMMUTABLE_AND_PARAM};
    opt.graph().iter([&cvprop](OperatorNodeBase *opr) {
        cvprop.add_opr(opr);
    });

    // Fetch the profiled algorithm name for a ConvBias from its profile
    // cache. Returns "" when no cached profile result exists. Missing
    // bias/z inputs are represented by empty layouts with deduced dtypes,
    // mirroring how the profile key was originally built.
    auto get_algo = [](const opr::ConvBias& opr) -> std::string {
        auto&& inputs = opr.input();
        SmallVector<TensorLayout> layouts;
        mgb_assert(inputs.size() >= 2 && inputs.size() <= 4);
        auto&& mo = opr.megdnn_opr();
        for (size_t i = 0; i < 4; i++) {
            if (inputs.size() <= i) {
                if (i == 2) {
                    //! bias
                    DType dtype;
                    mo->deduce_dtype(inputs[0]->dtype(), inputs[1]->dtype(),
                                     DType{}, DType{}, dtype);
                    layouts.emplace_back(TensorShape{}, dtype);
                } else {
                    layouts.emplace_back(TensorShape{}, opr.output(0)->dtype(),
                                         opr.output(0)->format());
                }
            } else {
                layouts.emplace_back(inputs[i]->shape(), inputs[i]->dtype(),
                                     inputs[i]->format());
            }
        }
        layouts.emplace_back(opr.output(0)->shape(), opr.output(0)->dtype(),
                             opr.output(0)->format());

        AlgoChooserProfileCache& cache = opr.profile_cache();
        auto param_blob = opr.param_blob();
        AlgoChooserProfileCache::Key cache_key{layouts.data(), layouts.size(),
                                               param_blob.first,
                                               param_blob.second};
        auto&& rst = cache.get(cache_key);
        if (!rst.valid())
            return "";
        auto prof = rst.val();
        if (prof.empty())
            return "";
        return prof[0].algo;
    };
    auto on_opr = [&](OperatorNodeBase* opr) {
        auto type = opr->dyn_typeinfo();
        do {
            if (type != opr::ConvBias::typeinfo())
                break;
            auto&& conv_bias_opr = opr->cast_final_safe<opr::ConvBias>();
            auto&& inputs = conv_bias_opr.input();
            VarNodeArray new_inp;
            new_inp.reserve(inputs.size());
            for (auto i : inputs) {
                new_inp.push_back(rewriter.get_var(i));
            }
            // Bail out unless the filter (input 1) is constant.
            if (!(cvprop.is_midconst(inputs[1]) ||
                  cvprop.is_const(inputs[1]))) {
                break;
            }
            // Only proceed when the profiled algo name parses as a
            // winograd algorithm.
            auto algo_name = get_algo(conv_bias_opr);
            auto winograd_param =
                    megdnn::ConvBias::parse_winograd_name(algo_name);
            if (winograd_param == megdnn::ConvBias::INVALID_WINOGRAD_PARAM)
                break;
            mgb_assert(
                    conv_bias_opr.param().format ==
                                    megdnn::ConvBias::Param::Format::NCHW ||
                            conv_bias_opr.param().format ==
                                    megdnn::ConvBias::Param::Format::NCHW88 ||
                            conv_bias_opr.param().format ==
                                    megdnn::ConvBias::Param::Format::NCHW44,
                    "currently winograd only support NCHW and NCHW44 and "
                    "NCHW88");
            opr::ConvBiasForward::check_winograd_param_valid(
                    winograd_param, conv_bias_opr.input(0)->dtype());
            megdnn::param::Winograd winograd_preprocess_param;
            winograd_preprocess_param.format =
                    opr::ConvBiasForward::get_matmul_format(winograd_param);
            winograd_preprocess_param.output_block_size =
                    winograd_param.output_block_size;

            auto conv_bias_param = conv_bias_opr.param();
            //! If input dtype is Qint8 and matmul format is MK4, The winograd
            //! compute type is float.
            if (conv_bias_opr.input(0)->dtype().enumv() ==
                        DTypeEnum::QuantizedS8 &&
                winograd_preprocess_param.format ==
                        megdnn::param::MatrixMul::Format::MK4) {
                winograd_preprocess_param.compute_mode =
                        megdnn::param::ConvBias::ComputeMode::FLOAT32;
                conv_bias_param.compute_mode =
                        megdnn::param::ConvBias::ComputeMode::FLOAT32;
            }

            auto winograd_preprocess_opr = opr::WinogradFilterPreprocess::make(
                    new_inp[1], winograd_preprocess_param);
            mgb_assert(inputs.size() == 2 || inputs.size() == 3,
                       "input size need to be 2/3, but got: %zu",
                       inputs.size());
            SymbolVar new_conv_bias_opr;

            // Pick the *_WINOGRAD format from the src ndim: 4-dim src means
            // NCHW; 5-dim src carries the pack size in the last axis.
            if (new_inp[0]->shape().ndim == 4) {
                conv_bias_param.format =
                        megdnn::ConvBias::Param::Format::NCHW_WINOGRAD;
            } else {
                mgb_assert(new_inp[0]->shape().ndim == 5);
                size_t pack_size = new_inp[0]->shape()[4];
                if (pack_size == 8) {
                    conv_bias_param.format =
                            megdnn::ConvBias::Param::Format::NCHW88_WINOGRAD;
                } else if (pack_size == 4) {
                    conv_bias_param.format =
                            megdnn::ConvBias::Param::Format::NCHW44_WINOGRAD;
                } else {
                    mgb_assert(0, "Invalid pack size %zu in algo %s", pack_size,
                               algo_name.c_str());
                }
            }

            conv_bias_param.output_block_size =
                    winograd_param.output_block_size;
            if (inputs.size() == 2) {
                new_conv_bias_opr = opr::ConvBias::make(
                        new_inp[0], winograd_preprocess_opr.node(),
                        conv_bias_param, conv_bias_opr.execution_policy(),
                        conv_bias_opr.config());
            } else {
                new_conv_bias_opr = opr::ConvBias::make(
                        new_inp[0], winograd_preprocess_opr.node(), new_inp[2],
                        conv_bias_param, conv_bias_opr.execution_policy(),
                        conv_bias_opr.config());
            }

            // Replace all non-volatile outputs of the old opr with the
            // outputs of the rewritten one.
            auto&& origin_out = conv_bias_opr.output();
            auto&& cur_out = new_conv_bias_opr.node()->owner_opr()->output();
            mgb_assert(origin_out.size() == cur_out.size());
            for (size_t i = 0; i < origin_out.size(); i++) {
                if (!origin_out[i]->contain_flag(
                            VarNode::Flag::VOLATILE_CONTENT)) {
                    rewriter.replace_var(origin_out[i], cur_out[i], nullptr);
                }
            }
            return;
        } while (0);

        rewriter.auto_replace_outputs(opr);
    };

    opt.graph().iter(on_opr);
    rewriter.apply_inplace();
    MIDOUT_E
}

/**
* \warning WinogradTransformReplacePass implies that we run ParamFuse pass
* before(currently run ParamFuse in optimize_for_inference when dump model),
otherwise it cannot deal with \c ConvBias(x, W+1), as the node of W+1 has no
* flag PERSISTENT_DEVICE_VALUE, it's a mid-const node, we should use
* ConstVarPropogate strictly speaking.
*/
//! Run the winograd filter-transform rewrite followed by param fusion,
//! rewriting \p dest_vars in place.
void gopt::transform_vars_inplace_with_winograd(
        mgb::cg::VarNodeArray& dest_vars) {
    gopt::GraphOptimizer graph_opt;
    // Order matters: the winograd pass inserts WinogradFilterPreprocess
    // nodes, and ParamFuse then folds them into constant transformed filters.
    graph_opt.add_pass<WinogradTransformReplacePass>();
    graph_opt.add_pass<ParamFusePass>();
    graph_opt.apply_inplace(dest_vars);
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 0
- 32
src/gopt/include/megbrain/gopt/weights_preprocess.h View File

@@ -1,32 +0,0 @@
/**
* \file src/gopt/include/megbrain/gopt/weights_preprocess.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#pragma once

#include "megbrain/gopt/framework.h"

namespace mgb {
namespace gopt {

class WinogradTransformReplacePass final : public Pass {
class Impl;

public:
const char* name() const override;
void apply(OptState& opt) const override;
};

void transform_vars_inplace_with_winograd(mgb::cg::VarNodeArray& dest_vars);

} // namespace gopt
} // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 0
- 13
src/opr/impl/search_policy/algo_chooser.cpp View File

@@ -46,7 +46,6 @@ AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result(

ConvTensorLayouts origin_layouts = ctx.layouts();
typename Opr::Param origin_param = ctx.mgb_opr()->param();
get_origin_param_and_layouts(ctx, origin_layouts, origin_param);
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(),
origin_layouts.size(), &origin_param,
sizeof(origin_param)};
@@ -104,18 +103,6 @@ AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result(
return prof_rst;
}

template <>
void AlgoChooser<megdnn::ConvBias>::get_origin_param_and_layouts(
const ExeContext& ctx, ConvTensorLayouts& layouts,
megdnn::ConvBias::Param& param) {
auto format = static_cast<megdnn::param::ConvBias::Format>(
ctx.megdnn_opr()->param().format);
size_t output_block_size = ctx.megdnn_opr()->param().output_block_size;
megdnn::ConvBias::deduce_winograd_origin_layout_and_param(
format, output_block_size, ctx.layouts()[0], ctx.layouts()[1],
layouts[1], param);
}

template <typename Opr>
typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::choose_by_profile(
ExeContext& ctx, bool require_reproducible, bool enable_update) {


+ 0
- 10
src/opr/impl/tensor_manip.cpp View File

@@ -1607,15 +1607,5 @@ void RelayoutFormat::init_output_format() {
}
// f}}}
//
/* f{{{ ===================== WinogradFilterPreprocess ===================== */
MGB_DYN_TYPE_OBJ_FINAL_IMPL(WinogradFilterPreprocess);
MEGDNN_OPR_INIT1(WinogradFilterPreprocess, "winograd_filter_preprocess")
void WinogradFilterPreprocess::init_output_dtype() {
    // Let megdnn deduce the transformed-filter layout from the input filter
    // layout; only the deduced dtype is propagated to the output var here
    // (the shape is handled by the generic shape-inference machinery).
    TensorLayout dst;
    TensorLayout src{input(0)->shape(), input(0)->dtype(), input(0)->format()};
    megdnn_opr()->deduce_layout(src, dst);
    output(0)->dtype(dst.dtype);
}
// f}}}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 0
- 1
src/opr/impl/tensor_manip.sereg.h View File

@@ -184,7 +184,6 @@ namespace opr {

MGB_REG_OPR_SHALLOW_COPY(ParamPackConcat, opr_shallow_copy_param_pack_concat);
MGB_SEREG_OPR(RelayoutFormat, 1);
MGB_SEREG_OPR(WinogradFilterPreprocess, 1);
} // namespace opr

} // namespace mgb


+ 0
- 4
src/opr/include/megbrain/opr/search_policy/algo_chooser.h View File

@@ -113,10 +113,6 @@ class AlgoChooser {
//! entrance for getting algorithm according to execution strategy
static ImplAlgo get_algo(ExeContext& ctx);

static void get_origin_param_and_layouts(const ExeContext&,
ConvTensorLayouts&,
typename Opr::Param&) {}

//! get all profile result, either by retrieving cache or profiling
static AlgoChooserProfileCache::Result get_profile_result(
ExeContext& ctx, bool enable_update);


+ 0
- 16
src/opr/include/megbrain/opr/tensor_manip.h View File

@@ -635,22 +635,6 @@ MGB_DEFINE_OPR_CLASS(RelayoutFormat,
const OperatorNodeConfig &config = {});
void init_output_format() override final;
};

/*!
* \brief change conv weights layout base on winograd transform.
*
* See docs of megdnn params for more details
*/
MGB_DEFINE_OPR_CLASS(WinogradFilterPreprocess,
intl::MegDNNOprWrapperFwd<megdnn::WinogradFilterPreprocess>)
public:
WinogradFilterPreprocess(VarNode* p0, const Param& param,
const OperatorNodeConfig& config);
static SymbolVar make(SymbolVar p0, const Param& param = {},
const OperatorNodeConfig& config = {});
void init_output_dtype() override final;
};

} // opr
} // mgb



+ 2
- 28
src/plugin/impl/opr_footprint.cpp View File

@@ -171,12 +171,6 @@ uint64_t eval_conv_computation(const TensorShape& src_shape,
cpos = 1;
spatial_start = 2;
break;
case Param::Format::NCHW_WINOGRAD:
case Param::Format::NCHW44_WINOGRAD:
case Param::Format::NCHW88_WINOGRAD:
cpos = 1;
spatial_start = 0;
break;
case Param::Format::NHWC:
cpos = 3;
spatial_start = 1;
@@ -203,29 +197,9 @@ uint64_t eval_conv_computation(const TensorShape& src_shape,

uint64_t fh = static_cast<uint64_t>(filter_shape[spatial_start]);
uint64_t fw = static_cast<uint64_t>(filter_shape[spatial_start + 1]);
if (param.format == Param::Format::NCHW_WINOGRAD ||
param.format == Param::Format::NCHW44_WINOGRAD ||
param.format == Param::Format::NCHW88_WINOGRAD) {
mgb_assert(opr->same_type<opr::ConvBias>(),
"Only conv bias support WINOGRAD");
auto&& conv_bias_opr = opr->cast_final_safe<opr::ConvBias>();
uint32_t output_block_size = conv_bias_opr.param().output_block_size;
mgb_assert(fh == fw,
"NCHW_WINOGRAD, NCHW88_WINOGRAD need fw==fh, got fw: %u fh "
"%u\n",
static_cast<uint32_t>(fh), static_cast<uint32_t>(fw));
fh = fh + 1 - output_block_size;
fw = fw + 1 - output_block_size;
}
// mul and add are counted as 2 operations
if(param.format == Param::Format::NCHW88_WINOGRAD){
return dst_shape.total_nr_elems() * fh * fw *
static_cast<uint64_t>(src_shape[cpos] * 8) / group * 2;
}
if (param.format == Param::Format::NCHW44_WINOGRAD) {
return dst_shape.total_nr_elems() * fh * fw *
static_cast<uint64_t>(src_shape[cpos] * 4) / group * 2;
}
return dst_shape.total_nr_elems() * fh * fw *
static_cast<uint64_t>(src_shape[cpos]) / group * 2;
}


+ 3
- 1
src/serialization/impl/schema.fbs View File

@@ -28,6 +28,7 @@ table Blob {
}

table Reserved0 {}
table DeprecatedParam {}

union OperatorParam {
param.Empty = 1,
@@ -50,7 +51,8 @@ union OperatorParam {
param.ElemwiseMultiType = 18,
param.PowC = 19,
param.MatrixMul = 20,
param.Winograd = 21,
//Reserved for param.Winograd = 21,
DeprecatedParam = 21,
param.SVD = 22,
param.Reduce = 23,
param.Cumsum = 24,


Loading…
Cancel
Save