refactor(dnn): refactor winograd strategy helper

GitOrigin-RevId: ecc2b15df9
5 years ago · dc6f89f2d3
--- a/dnn/src/common/winograd/winograd_helper.cpp
+++ b/dnn/src/common/winograd/winograd_helper.cpp
@@ -58,368 +58,300 @@ struct OutputGetter<
        return dtype.param<dtype::Quantized8Asymm>().quantize(item).as_uint8();
    }
 };

 }  // namespace

 namespace megdnn {
 namespace winograd {

 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type>
 class StrategyHelper<ctype, dst_type, input_filter_compute_type,
                     output_compute_type, param::MatrixMul::Format::DEFAULT> {
 public:
    static void filter(const ctype* filter,
                       input_filter_compute_type* filter_transform_buf,
                       input_filter_compute_type* transform_mid_buf, size_t OC,
                       size_t IC, size_t oc_start, size_t oc_end, size_t m,
                       size_t r, const std::vector<float>& interp_points,
                       DType dtype, float rescale) {
        size_t alpha = m + r - 1;
        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
                                                                interp_points);

        input_filter_compute_type* mid_buf1 = transform_mid_buf;
        input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;

        Getter<ctype, input_filter_compute_type> getter(dtype);
        for (size_t oc = oc_start; oc < oc_end; oc++) {
            rep(ic, IC) {
                const ctype* filter_ptr = filter + (oc * IC + ic) * r * r;
                rep(i, r) rep(j, r) {
                    mid_buf1[i * r + j] = getter(filter_ptr[i * r + j]);
                }
 /*
  * Pack size associated with a ConvBias tensor layout: 32 for NCHW32, 8 for
  * NCHW8 / NCHW88, 4 for NCHW4 / NHWCD4 and 1 for every other format.
  * The visitors in this file use the value as the divisor / modulus that
  * splits a channel index into (block, position inside block).
  */
 constexpr size_t layout_pack_size(param::ConvBias::Format layout) {
    using Format = param::ConvBias::Format;
    switch (layout) {
        case Format::NCHW32:
            return 32;
        case Format::NCHW8:
        case Format::NCHW88:
            return 8;
        case Format::NCHW4:
        case Format::NHWCD4:
            return 4;
        default:
            // any format not listed above is treated as unpacked
            return 1;
    }
 }

 template <param::ConvBias::Format layout, param::MatrixMul::Format format>
 struct FilterVisitor {
    size_t IC, OC;
    FilterVisitor(size_t OC, size_t IC) : IC(IC), OC(OC) {}
    size_t get(size_t r, size_t oc, size_t ic, size_t h, size_t w) {
        constexpr size_t input_pack_size = layout_pack_size(layout);
        size_t ocb_layout = oc / input_pack_size;
        size_t oc_layout = oc % input_pack_size;
        size_t icb_layout = ic / input_pack_size;
        size_t ic_layout = ic % input_pack_size;

        return (ocb_layout * (IC / input_pack_size) + icb_layout) * r * r *
                       input_pack_size * input_pack_size +
               ic_layout * input_pack_size + oc_layout +
               (h * r + w) * input_pack_size * input_pack_size;
    }

                /* tmp = Matmul(G, src) */
                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                                  input_filter_compute_type,
                                                  false, false>(
                        winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2,
                        alpha, r, r, r, r, r, dtype, dtype);
                /* dst = Matmul(tmp, G^T) */
                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                                  input_filter_compute_type,
                                                  false, true>(
                        mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1,
                        alpha, alpha, r, r, r, alpha, dtype, dtype);

                rep(i, alpha) rep(j, alpha) {
                    filter_transform_buf[(i * alpha + j) * OC * IC + ic * OC +
                                         oc] = mid_buf1[i * alpha + j];
                }
            }
    /*
     * Flat offset at which the transformed filter value for (oc, ic) at
     * position (h, w) of the alpha x alpha tile is stored.
     *
     * DEFAULT matmul format: [h * alpha + w][ic][oc].
     * Otherwise, with P = MatrixMulForward::pack_size(format):
     *   [h * alpha + w][oc / P][ic / P][ic % P][oc % P]
     * using OC / P and IC / P as the block counts.
     */
    size_t put(size_t alpha, size_t oc, size_t ic, size_t h, size_t w) {
        const size_t tile_pos = h * alpha + w;
        if (format == param::MatrixMul::Format::DEFAULT) {
            return (tile_pos * IC + ic) * OC + oc;
        }

        // pack size is only queried for the non-DEFAULT formats
        const size_t pack = MatrixMulForward::pack_size(format);
        const size_t oc_blocks = OC / pack;
        const size_t ic_blocks = IC / pack;

        const size_t block =
                (tile_pos * oc_blocks + oc / pack) * ic_blocks + ic / pack;
        return (block * pack + ic % pack) * pack + oc % pack;
    }
 };

    static void input(const ctype* input,
                      input_filter_compute_type* input_transform_buf,
                      input_filter_compute_type* transform_mid_buf,
                      int ih_start, int iw_start, size_t IH, size_t IW,
                      size_t IC, size_t unit_idx, size_t nr_units_in_tile,
                      size_t m, size_t r,
                      const std::vector<float>& interp_points, DType dtype,
                      float rescale) {
        size_t alpha = m + r - 1;
        Getter<ctype, input_filter_compute_type> getter(dtype);
        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
                                                                interp_points);
        rep(ic, IC) {
            input_filter_compute_type* mid_buf1 = transform_mid_buf;
            input_filter_compute_type* mid_buf2 =
                    transform_mid_buf + alpha * alpha;
 template <param::ConvBias::Format layout, param::MatrixMul::Format format>
 struct InputVisitor {
    size_t IC;
    InputVisitor(size_t IC) : IC(IC) {}

            memset(mid_buf1, 0,
                   alpha * alpha * sizeof(input_filter_compute_type));
            rep(i, alpha) rep(j, alpha) {
                int ih = ih_start + i;
                int iw = iw_start + j;
                if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) {
                    mid_buf1[i * alpha + j] =
                            getter(input[ic * IH * IW + ih * IW + iw]);
                }
            }
            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                              input_filter_compute_type, true,
                                              false>(
                    winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha,
                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                              input_filter_compute_type, false,
                                              false>(
                    mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha,
                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
            rep(i, alpha) rep(j, alpha) {
                input_transform_buf[(i * alpha + j) * nr_units_in_tile * IC +
                                    unit_idx * IC + ic] =
                        mid_buf1[i * alpha + j];
            }
        }
    size_t get(size_t alpha, size_t ic, size_t IH, size_t IW, size_t ih,
               size_t iw) {
        constexpr size_t input_pack_size = layout_pack_size(layout);
        size_t icb_layout = ic / input_pack_size;
        size_t ic_layout = ic % input_pack_size;

        return (icb_layout * IH * IW + ih * IW + iw) * input_pack_size +
               ic_layout;
    }

    static void output(const output_compute_type* output_transform_buf,
                       const output_compute_type* bias, dst_type* output,
                       output_compute_type* transform_mid_buf, BiasMode bmode,
                       NonlineMode nonline_mode, size_t oh_start,
                       size_t ow_start, size_t OH, size_t OW, size_t oc_start,
                       size_t oc_end, size_t unit_idx, size_t nr_units_in_tile,
                       size_t m, size_t r,
                       const std::vector<float>& interp_points, DType dtype,
                       float input_filter_scale, float input_filter_rescale,
                       float rescale) {
        size_t alpha = m + r - 1;
        size_t OC = oc_end - oc_start;

        OutputGetter<output_compute_type, dst_type> getter(dtype);
        winograd::WinogradCoeff<output_compute_type> winograd_coeff(
                m, r, interp_points);
        for (size_t oc = oc_start; oc < oc_end; oc++) {
            size_t oc_index = oc - oc_start;
            output_compute_type* mid_buf1 = transform_mid_buf;
            output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;

            // gather
            rep(i, alpha) rep(j, alpha) {
                mid_buf1[i * alpha + j] =
                        output_transform_buf[(i * alpha + j) *
                                                     nr_units_in_tile * OC +
                                             unit_idx * OC + oc_index];
            }
            /* A[alpha*m] M[alpha*alpha] */
            megdnn::naive::run_matrix_mul_tpl<output_compute_type,
                                              output_compute_type, true, false>(
                    winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m,
                    alpha, alpha, m, alpha, alpha, dtype, dtype);
            megdnn::naive::run_matrix_mul_tpl<
                    output_compute_type, output_compute_type, false, false>(
                    mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m,
                    alpha, alpha, m, m, dtype, dtype);
            rep(i, m) rep(j, m) {
                auto oh = oh_start + i;
                auto ow = ow_start + j;
                if (oh < OH && ow < OW) {
                    float val = mid_buf1[i * m + j];
                    if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
                        val += bias[oc] * input_filter_rescale *
                               input_filter_rescale;
                    } else if (bmode == BiasMode::BIAS) {
                        val += bias[oc * OH * OW + oh * OW + ow] *
                               input_filter_rescale * input_filter_rescale;
                    }
                    val = val * input_filter_scale /
                          (input_filter_rescale * input_filter_rescale *
                           rescale * rescale);
                    if (nonline_mode == NonlineMode::RELU) {
                        val = val > 0 ? val : 0;
                    } else if (nonline_mode == NonlineMode::SIGMOID) {
                        val = 1.f / (expf(-val) + 1.f);
                    } else if (nonline_mode == NonlineMode::H_SWISH) {
                        val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f;
                    } else {
                        megdnn_assert(nonline_mode == NonlineMode::IDENTITY);
                    }

                    output[oc * OH * OW + oh * OW + ow] = getter(val);
                }
            }
    /*
     * Flat offset at which the transformed input value for channel ic of
     * unit unit_idx at position (h, w) of the alpha x alpha tile is stored.
     *
     * DEFAULT matmul format: [h * alpha + w][unit_idx][ic].
     * Otherwise, with P = MatrixMulForward::pack_size(format):
     *   [h * alpha + w][ic / P][unit_idx][ic % P]
     * using IC / P as the block count.
     */
    size_t put(size_t alpha, size_t ic, size_t nr_units_in_tile,
               size_t unit_idx, size_t h, size_t w) {
        const size_t tile_pos = h * alpha + w;
        if (format == param::MatrixMul::Format::DEFAULT) {
            return (tile_pos * nr_units_in_tile + unit_idx) * IC + ic;
        }

        // pack size is only queried for the non-DEFAULT formats
        const size_t pack = MatrixMulForward::pack_size(format);
        const size_t ic_blocks = IC / pack;

        const size_t slot =
                (tile_pos * ic_blocks + ic / pack) * nr_units_in_tile +
                unit_idx;
        return slot * pack + ic % pack;
    }
 };

 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type, param::MatrixMul::Format format>
 class StrategyHelper<
        ctype, dst_type, input_filter_compute_type, output_compute_type, format,
        std::enable_if_t<format == param::MatrixMul::Format::MK4 ||
                         format == param::MatrixMul::Format::MK8>> {
 public:
    static void filter(const ctype* filter,
                       input_filter_compute_type* filter_transform_buf,
                       input_filter_compute_type* transform_mid_buf, size_t OC,
                       size_t IC, size_t oc_start, size_t oc_end, size_t m,
                       size_t r, const std::vector<float>& interp_points,
                       DType dtype, float rescale) {
        size_t alpha = m + r - 1;
        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
                                                                interp_points);

        input_filter_compute_type* mid_buf1 = transform_mid_buf;
        input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;

        Getter<ctype, input_filter_compute_type> getter(dtype);
        size_t OCB = OC / pack_size;
        size_t ICB = IC / pack_size;
        for (size_t oc = oc_start; oc < oc_end; oc++) {
            rep(ic, IC) {
                const ctype* filter_ptr = filter + (oc * IC + ic) * r * r;
                rep(i, r) rep(j, r) {
                    mid_buf1[i * r + j] = getter(filter_ptr[i * r + j]);
                }
 template <param::ConvBias::Format layout, param::MatrixMul::Format format>
 struct OutputVisitor {
    size_t OC;
    OutputVisitor(size_t OC) : OC(OC) {}

                /* tmp = Matmul(G, src) */
                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                                  input_filter_compute_type,
                                                  false, false>(
                        winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2,
                        alpha, r, r, r, r, r, dtype, dtype);
                /* dst = Matmul(tmp, G^T) */
                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                                  input_filter_compute_type,
                                                  false, true>(
                        mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1,
                        alpha, alpha, r, r, r, alpha, dtype, dtype);

                size_t ocb = oc / pack_size;
                size_t oc_pack = oc % pack_size;
                size_t icb = ic / pack_size;
                size_t ic_pack = ic % pack_size;
                rep(i, alpha) rep(j, alpha) {
                    filter_transform_buf[(i * alpha + j) * OCB * ICB *
                                                 pack_size * pack_size +
                                         ocb * ICB * pack_size * pack_size +
                                         icb * pack_size * pack_size +
                                         ic_pack * pack_size + oc_pack] =
                            mid_buf1[i * alpha + j];
                }
            }
    /*
     * Flat offset from which the matmul result for one output channel of
     * unit unit_idx at position (h, w) of the alpha x alpha tile is read.
     *
     * DEFAULT matmul format: [h * alpha + w][unit_idx][oc_index].
     * Otherwise, with P = MatrixMulForward::pack_size(format):
     *   [h * alpha + w][oc_index / P][unit_idx][oc % P]
     * using OC / P as the block count. Note that the block comes from
     * oc_index while the position inside the block comes from oc.
     * NOTE(review): the two agree only if oc - oc_index is a multiple of P
     * -- confirm against callers.
     */
    size_t get(size_t alpha, size_t oc_index, size_t oc,
               size_t nr_units_in_tile, size_t unit_idx, size_t h, size_t w) {
        const size_t tile_pos = h * alpha + w;
        if (format == param::MatrixMul::Format::DEFAULT) {
            return (tile_pos * nr_units_in_tile + unit_idx) * OC + oc_index;
        }

        // pack size is only queried for the non-DEFAULT formats
        const size_t pack = MatrixMulForward::pack_size(format);
        const size_t oc_blocks = OC / pack;

        const size_t slot =
                (tile_pos * oc_blocks + oc_index / pack) * nr_units_in_tile +
                unit_idx;
        return slot * pack + oc % pack;
    }

    static void input(const ctype* input,
                      input_filter_compute_type* input_transform_buf,
                      input_filter_compute_type* transform_mid_buf,
                      int ih_start, int iw_start, size_t IH, size_t IW,
                      size_t IC, size_t unit_idx, size_t nr_units_in_tile,
                      size_t m, size_t r,
                      const std::vector<float>& interp_points, DType dtype,
                      float rescale) {
        size_t alpha = m + r - 1;
        Getter<ctype, input_filter_compute_type> getter(dtype);
        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
                                                                interp_points);
        size_t ICB = IC / pack_size;
        rep(ic, IC) {
            input_filter_compute_type* mid_buf1 = transform_mid_buf;
            input_filter_compute_type* mid_buf2 =
                    transform_mid_buf + alpha * alpha;
    size_t put(size_t oc, size_t OH, size_t OW, size_t oh, size_t ow) {
        constexpr size_t input_pack_size = layout_pack_size(layout);
        size_t oc_layout = oc % input_pack_size;

            memset(mid_buf1, 0,
                   alpha * alpha * sizeof(input_filter_compute_type));
            rep(i, alpha) rep(j, alpha) {
                int ih = ih_start + i;
                int iw = iw_start + j;
                if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) {
                    mid_buf1[i * alpha + j] =
                            getter(input[ic * IH * IW + ih * IW + iw]);
                }
        return (oc / input_pack_size * OH * OW + oh * OW + ow) *
                       input_pack_size +
               oc_layout;
    }
 };

 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type, param::ConvBias::Format layout,
          param::MatrixMul::Format format>
 void StrategyHelper<
        ctype, dst_type, input_filter_compute_type, output_compute_type, layout,
        format>::filter(const ctype* filter,
                        input_filter_compute_type* filter_transform_buf,
                        input_filter_compute_type* transform_mid_buf, size_t OC,
                        size_t IC, size_t oc_start, size_t oc_end, size_t m,
                        size_t r, const std::vector<float>& interp_points,
                        DType dtype, float rescale) {
    size_t alpha = m + r - 1;
    WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
                                                            interp_points);
    input_filter_compute_type* mid_buf1 = transform_mid_buf;
    input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
    Getter<ctype, input_filter_compute_type> getter(dtype);
    FilterVisitor<layout, format> filter_visitor(OC, IC);

    for (size_t oc = oc_start; oc < oc_end; oc++) {
        rep(ic, IC) {
            rep(i, r) rep(j, r) {
                mid_buf1[i * r + j] =
                        getter(filter[filter_visitor.get(r, oc, ic, i, j)]);
            }

            /* tmp = Matmul(G, src) */
            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                              input_filter_compute_type, true,
                                              input_filter_compute_type, false,
                                              false>(
                    winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha,
                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
                    winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2, alpha,
                    r, r, r, r, r, dtype, dtype);
            /* dst = Matmul(tmp, G^T) */
            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                              input_filter_compute_type, false,
                                              false>(
                    mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha,
                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
            size_t icb = ic / pack_size;
            size_t ic_pack = ic % pack_size;
                                              true>(
                    mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1, alpha,
                    alpha, r, r, r, alpha, dtype, dtype);

            rep(i, alpha) rep(j, alpha) {
                input_transform_buf[(i * alpha + j) * ICB * nr_units_in_tile *
                                            pack_size +
                                    icb * nr_units_in_tile * pack_size +
                                    unit_idx * pack_size + ic_pack] =
                filter_transform_buf[filter_visitor.put(alpha, oc, ic, i, j)] =
                        mid_buf1[i * alpha + j];
            }
        }
    }
 }

    static void output(const output_compute_type* output_transform_buf,
                       const output_compute_type* bias, dst_type* output,
                       output_compute_type* transform_mid_buf, BiasMode bmode,
                       NonlineMode nonline_mode, size_t oh_start,
                       size_t ow_start, size_t OH, size_t OW, size_t oc_start,
                       size_t oc_end, size_t unit_idx, size_t nr_units_in_tile,
 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type, param::ConvBias::Format layout,
          param::MatrixMul::Format format>
 void StrategyHelper<
        ctype, dst_type, input_filter_compute_type, output_compute_type, layout,
        format>::input(const ctype* input,
                       input_filter_compute_type* input_transform_buf,
                       input_filter_compute_type* transform_mid_buf,
                       int ih_start, int iw_start, size_t IH, size_t IW,
                       size_t IC, size_t unit_idx, size_t nr_units_in_tile,
                       size_t m, size_t r,
                       const std::vector<float>& interp_points, DType dtype,
                       float input_filter_scale, float input_filter_rescale,
                       float rescale) {
        size_t alpha = m + r - 1;
        size_t OC = oc_end - oc_start;

        OutputGetter<output_compute_type, dst_type> getter(dtype);
        winograd::WinogradCoeff<output_compute_type> winograd_coeff(
                m, r, interp_points);
        size_t OCB = OC / pack_size;
        for (size_t oc = oc_start; oc < oc_end; oc++) {
            output_compute_type* mid_buf1 = transform_mid_buf;
            output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;

            size_t ocb = (oc - oc_start) / pack_size;
            size_t oc_pack = oc % pack_size;
            // gather
            rep(i, alpha) rep(j, alpha) {
                mid_buf1[i * alpha + j] = output_transform_buf
                        [(i * alpha + j) * OCB * nr_units_in_tile * pack_size +
                         ocb * nr_units_in_tile * pack_size +
                         unit_idx * pack_size + oc_pack];
    size_t alpha = m + r - 1;
    WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
                                                            interp_points);
    input_filter_compute_type* mid_buf1 = transform_mid_buf;
    input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
    Getter<ctype, input_filter_compute_type> getter(dtype);
    InputVisitor<layout, format> intput_visitor(IC);

    rep(ic, IC) {
        memset(mid_buf1, 0, alpha * alpha * sizeof(input_filter_compute_type));
        rep(i, alpha) rep(j, alpha) {
            int ih = ih_start + i;
            int iw = iw_start + j;
            if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) {
                mid_buf1[i * alpha + j] = getter(
                        input[intput_visitor.get(alpha, ic, IH, IW, ih, iw)]);
            }
            /* A[alpha*m] M[alpha*alpha] */
            megdnn::naive::run_matrix_mul_tpl<output_compute_type,
                                              output_compute_type, true, false>(
                    winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m,
                    alpha, alpha, m, alpha, alpha, dtype, dtype);
            megdnn::naive::run_matrix_mul_tpl<
                    output_compute_type, output_compute_type, false, false>(
                    mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m,
                    alpha, alpha, m, m, dtype, dtype);
            rep(i, m) rep(j, m) {
                auto oh = oh_start + i;
                auto ow = ow_start + j;
                if (oh < OH && ow < OW) {
                    float val = mid_buf1[i * m + j];
                    if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
                        val += bias[oc] * input_filter_rescale *
                               input_filter_rescale;
                    } else if (bmode == BiasMode::BIAS) {
                        val += bias[oc * OH * OW + oh * OW + ow] *
                               input_filter_rescale * input_filter_rescale;
                    }
                    val = val * input_filter_scale /
                          (input_filter_rescale * input_filter_rescale *
                           rescale * rescale);
                    if (nonline_mode == NonlineMode::RELU) {
                        val = val > 0 ? val : 0;
                    } else if (nonline_mode == NonlineMode::SIGMOID) {
                        val = 1.f / (expf(-val) + 1.f);
                    } else if (nonline_mode == NonlineMode::H_SWISH) {
                        val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f;
                    } else {
                        megdnn_assert(nonline_mode == NonlineMode::IDENTITY);
                    }

                    output[oc * OH * OW + oh * OW + ow] = getter(val);
        }

        megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                          input_filter_compute_type, true,
                                          false>(
                winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha,
                alpha, alpha, alpha, alpha, alpha, dtype, dtype);
        megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                          input_filter_compute_type, false,
                                          false>(
                mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha,
                alpha, alpha, alpha, alpha, alpha, dtype, dtype);

        rep(i, alpha) rep(j, alpha) {
            input_transform_buf[intput_visitor.put(alpha, ic, nr_units_in_tile,
                                                   unit_idx, i, j)] =
                    mid_buf1[i * alpha + j];
        }
    }
 }

 /*
  * Output stage of the Winograd helper for one unit (unit_idx) of a tile.
  *
  * For every output channel oc in [oc_start, oc_end):
  *   1. gather the alpha x alpha values of that channel from
  *      output_transform_buf, addressed through OutputVisitor::get;
  *   2. run two run_matrix_mul_tpl calls with winograd_coeff.A(rescale),
  *      leaving an m x m result in mid_buf1;
  *   3. for each result element whose position (oh_start + i, ow_start + j)
  *      lies inside OH x OW: add the bias selected by bmode, rescale, apply
  *      nonline_mode, convert through OutputGetter and store it into output
  *      at the offset given by OutputVisitor::put.
  *
  * transform_mid_buf is scratch space: mid_buf1 starts at offset 0 and
  * mid_buf2 starts alpha * alpha elements later, where alpha = m + r - 1.
  */
 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type, param::ConvBias::Format layout,
          param::MatrixMul::Format format>
 void StrategyHelper<
        ctype, dst_type, input_filter_compute_type, output_compute_type, layout,
        format>::output(const output_compute_type* output_transform_buf,
                        const output_compute_type* bias, dst_type* output,
                        output_compute_type* transform_mid_buf, BiasMode bmode,
                        NonlineMode nonline_mode, size_t oh_start,
                        size_t ow_start, size_t OH, size_t OW, size_t oc_start,
                        size_t oc_end, size_t unit_idx, size_t nr_units_in_tile,
                        size_t m, size_t r,
                        const std::vector<float>& interp_points, DType dtype,
                        float input_filter_scale, float input_filter_rescale,
                        float rescale) {
    size_t alpha = m + r - 1;
    winograd::WinogradCoeff<output_compute_type> winograd_coeff(m, r,
                                                                interp_points);
    output_compute_type* mid_buf1 = transform_mid_buf;
    output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
    OutputGetter<output_compute_type, dst_type> getter(dtype);
    // the visitor is built with the number of channels handled by this call,
    // not with a total channel count
    OutputVisitor<layout, format> output_visitor(oc_end - oc_start);

    for (size_t oc = oc_start; oc < oc_end; oc++) {
        /* gather */
        // both the range-relative index (oc - oc_start) and the absolute oc
        // are handed to the visitor
        rep(i, alpha) rep(j, alpha) {
            mid_buf1[i * alpha + j] = output_transform_buf[output_visitor.get(
                    alpha, oc - oc_start, oc, nr_units_in_tile, unit_idx, i,
                    j)];
        }
        /* A[alpha*m] M[alpha*alpha] */
        // first product: mid_buf1 -> mid_buf2
        // NOTE(review): the <true, false> flags presumably select a
        // transposed first operand, i.e. A^T * M -- confirm against
        // run_matrix_mul_tpl
        megdnn::naive::run_matrix_mul_tpl<output_compute_type,
                                          output_compute_type, true, false>(
                winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m, alpha,
                alpha, m, alpha, alpha, dtype, dtype);
        // second product: mid_buf2 -> mid_buf1, read below as an m x m block
        megdnn::naive::run_matrix_mul_tpl<output_compute_type,
                                          output_compute_type, false, false>(
                mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m,
                alpha, alpha, m, m, dtype, dtype);

        rep(i, m) rep(j, m) {
            auto oh = oh_start + i;
            auto ow = ow_start + j;
            // positions that fall outside the OH x OW output are skipped
            if (oh < OH && ow < OW) {
                // post-processing is done in float whatever
                // output_compute_type is
                float val = mid_buf1[i * m + j];
                // the bias is multiplied by input_filter_rescale^2 here
                // because the division below contains the same factor; its
                // net contribution is
                // bias * input_filter_scale / rescale^2
                if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
                    // one bias value per channel, indexed by the absolute oc
                    val += bias[oc] * input_filter_rescale *
                           input_filter_rescale;
                } else if (bmode == BiasMode::BIAS) {
                    // per-element bias, read at the same offset the result
                    // is written to
                    val += bias[output_visitor.put(oc, OH, OW, oh, ow)] *
                           input_filter_rescale * input_filter_rescale;
                }
                val = val * input_filter_scale /
                      (input_filter_rescale * input_filter_rescale * rescale *
                       rescale);
                if (nonline_mode == NonlineMode::RELU) {
                    val = val > 0 ? val : 0;
                } else if (nonline_mode == NonlineMode::SIGMOID) {
                    val = 1.f / (expf(-val) + 1.f);
                } else if (nonline_mode == NonlineMode::H_SWISH) {
                    // val * clamp(val + 3, 0, 6) / 6
                    val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f;
                } else {
                    // any remaining mode must be IDENTITY; val is unchanged
                    megdnn_assert(nonline_mode == NonlineMode::IDENTITY);
                }
                // getter converts the float value to dst_type using dtype
                output[output_visitor.put(oc, OH, OW, oh, ow)] = getter(val);
            }
        }
    }

    static size_t pack_size;
 };

 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type, param::MatrixMul::Format format>
 size_t StrategyHelper<
        ctype, dst_type, input_filter_compute_type, output_compute_type, format,
        std::enable_if_t<format == param::MatrixMul::Format::MK4 ||
                         format == param::MatrixMul::Format::MK8>>::pack_size =
        MatrixMulForward::pack_size(format);

 #define INST(_ctype, _dst_type, _input_filter_compute_type, \
             _output_compute_type)                          \
    template class StrategyHelper<                          \
            _ctype, _dst_type, _input_filter_compute_type,  \
            _output_compute_type, param::MatrixMul::Format::DEFAULT>;
 #define INST(_ctype, _dst_type, _input_filter_compute_type,   \
             _output_compute_type)                            \
    template class StrategyHelper<_ctype, _dst_type,          \
                                  _input_filter_compute_type, \
                                  _output_compute_type>;

 INST(float, float, float, float)
 MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16))
@@ -428,234 +360,23 @@ INST(uint8_t, uint8_t, int16_t, int)
 #undef INST

 #define INST(_ctype, _dst_type, _input_filter_compute_type, \
             _output_compute_type)                          \
             _output_compute_type, layout)                  \
    template class StrategyHelper<                          \
            _ctype, _dst_type, _input_filter_compute_type,  \
            _output_compute_type, param::MatrixMul::Format::MK4>;
 INST(float, float, float, float)
            _output_compute_type, layout, param::MatrixMul::Format::MK4>;
 INST(float, float, float, float, param::ConvBias::Format::NCHW)
 #undef INST

 #define INST(_ctype, _dst_type, _input_filter_compute_type, \
             _output_compute_type)                          \
             _output_compute_type, layout)                  \
    template class StrategyHelper<                          \
            _ctype, _dst_type, _input_filter_compute_type,  \
            _output_compute_type, param::MatrixMul::Format::MK8>;
 INST(int8_t, int8_t, int16_t, int)
 MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16))
 #undef INST

 /**
 * \brief Reference winograd transforms for tensors whose channels are blocked
 * in groups of \c pack_size; this specialization is enabled only when
 * \p format is MatrixMul::Format::MK8.
 *
 * With alpha = m + r - 1, every routine treats \p transform_mid_buf as two
 * consecutive scratch tiles: the first starts at offset 0 and the second at
 * offset alpha * alpha.
 */
 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type, param::MatrixMul::Format format>
 class StrategyHelperNchwxx<
        ctype, dst_type, input_filter_compute_type, output_compute_type, format,
        std::enable_if_t<format == param::MatrixMul::Format::MK8>> {
 public:
    /**
     * \brief Transform the r x r filters of output channels
     * [oc_start, oc_end) against all \p IC input channels.
     *
     * Each filter is gathered from the blocked source (element
     * (ocb, icb, i, j, ic_pack, oc_pack)), transformed as G * g * G^T into an
     * alpha x alpha tile and scattered to \p filter_transform_buf at
     * (i * alpha + j, ocb, icb, ic_pack, oc_pack).
     *
     * \param OC,IC total channel counts; both, as well as \p oc_start and
     *      \p oc_end, must be multiples of \c pack_size
     * \param m,r output block size and filter size
     * \param interp_points forwarded to WinogradCoeff
     * \param dtype used to build the Getter and passed to the matrix multiply
     * \param rescale forwarded to WinogradCoeff::G()
     */
    static void filter(const ctype* filter,
                       input_filter_compute_type* filter_transform_buf,
                       input_filter_compute_type* transform_mid_buf, size_t OC,
                       size_t IC, size_t oc_start, size_t oc_end, size_t m,
                       size_t r, const std::vector<float>& interp_points,
                       DType dtype, float rescale) {
        megdnn_assert(
                (oc_end - oc_start) % pack_size == 0 &&
                        oc_start % pack_size == 0 &&
                        oc_end % pack_size == 0 && IC % pack_size == 0 &&
                        OC % pack_size == 0,
                "Winograd filter transform input param is not times of 8!");

        size_t alpha = m + r - 1;
        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
                                                                interp_points);
        //! G depends only on (m, r, interp_points, rescale), so it is built
        //! once here and shared by every (oc, ic) pair
        auto&& coeff_g = winograd_coeff.G(rescale);

        input_filter_compute_type* mid_buf1 = transform_mid_buf;
        input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;

        Getter<ctype, input_filter_compute_type> getter(dtype);
        size_t OCB = OC / pack_size;
        size_t ICB = IC / pack_size;
        for (size_t oc = oc_start; oc < oc_end; oc++) {
            size_t ocb = oc / pack_size;
            size_t oc_pack = oc % pack_size;
            rep(ic, IC) {
                size_t icb = ic / pack_size;
                size_t ic_pack = ic % pack_size;

                //! first element of this (oc, ic) filter; consecutive filter
                //! taps are pack_size * pack_size elements apart
                const ctype* filter_ptr =
                        filter +
                        (ocb * ICB + icb) * r * r * pack_size * pack_size +
                        ic_pack * pack_size + oc_pack;
                rep(i, r) rep(j, r) {
                    mid_buf1[i * r + j] = getter(
                            filter_ptr[(i * r + j) * pack_size * pack_size]);
                }

                /* tmp = Matmul(G, src) */
                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                                  input_filter_compute_type,
                                                  false, false>(
                        coeff_g.data(), mid_buf1, mid_buf2, alpha, r, r, r, r,
                        r, dtype, dtype);
                /* dst = Matmul(tmp, G^T) */
                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                                  input_filter_compute_type,
                                                  false, true>(
                        mid_buf2, coeff_g.data(), mid_buf1, alpha, alpha, r, r,
                        r, alpha, dtype, dtype);

                rep(i, alpha) rep(j, alpha) {
                    filter_transform_buf[(i * alpha + j) * OCB * ICB *
                                                 pack_size * pack_size +
                                         ocb * ICB * pack_size * pack_size +
                                         icb * pack_size * pack_size +
                                         ic_pack * pack_size + oc_pack] =
                            mid_buf1[i * alpha + j];
                }
            }
        }
    }

    /**
     * \brief Transform one alpha x alpha input tile for every input channel.
     *
     * The tile's top-left corner is (ih_start, iw_start); positions outside
     * [0, IH) x [0, IW) contribute zero. Each tile is transformed as
     * B^T * d * B and stored to \p input_transform_buf at
     * (i * alpha + j, icb, unit_idx, ic_pack).
     *
     * \param unit_idx index of this tile within the current group of
     *      \p nr_units_in_tile tiles
     * \param rescale forwarded to WinogradCoeff::B()
     */
    static void input(const ctype* input,
                      input_filter_compute_type* input_transform_buf,
                      input_filter_compute_type* transform_mid_buf,
                      int ih_start, int iw_start, size_t IH, size_t IW,
                      size_t IC, size_t unit_idx, size_t nr_units_in_tile,
                      size_t m, size_t r,
                      const std::vector<float>& interp_points, DType dtype,
                      float rescale) {
        size_t alpha = m + r - 1;
        Getter<ctype, input_filter_compute_type> getter(dtype);
        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
                                                                interp_points);
        //! B is channel independent: build it once for the whole tile
        auto&& coeff_b = winograd_coeff.B(rescale);

        input_filter_compute_type* mid_buf1 = transform_mid_buf;
        input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;

        size_t ICB = IC / pack_size;
        rep(ic, IC) {
            size_t icb = ic / pack_size;
            size_t ic_pack = ic % pack_size;

            //! zero first so that out-of-image positions act as padding
            memset(mid_buf1, 0,
                   alpha * alpha * sizeof(input_filter_compute_type));
            rep(i, alpha) rep(j, alpha) {
                int ih = ih_start + i;
                int iw = iw_start + j;
                if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) {
                    mid_buf1[i * alpha + j] = getter(
                            input[(icb * IH * IW + ih * IW + iw) * pack_size +
                                  ic_pack]);
                }
            }
            /* tmp = Matmul(B^T, src) */
            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                              input_filter_compute_type, true,
                                              false>(
                    coeff_b.data(), mid_buf1, mid_buf2, alpha, alpha, alpha,
                    alpha, alpha, alpha, dtype, dtype);
            /* dst = Matmul(tmp, B) */
            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                              input_filter_compute_type, false,
                                              false>(
                    mid_buf2, coeff_b.data(), mid_buf1, alpha, alpha, alpha,
                    alpha, alpha, alpha, dtype, dtype);
            rep(i, alpha) rep(j, alpha) {
                input_transform_buf[(i * alpha + j) * ICB * nr_units_in_tile *
                                            pack_size +
                                    icb * nr_units_in_tile * pack_size +
                                    unit_idx * pack_size + ic_pack] =
                        mid_buf1[i * alpha + j];
            }
        }
    }

    /**
     * \brief Inverse-transform one unit for output channels
     * [oc_start, oc_end) and write the resulting m x m tile.
     *
     * The unit is gathered from \p output_transform_buf, whose channel-block
     * index is relative to \p oc_start, and reduced to an m x m tile as
     * A^T * M * A. For every pixel inside [0, OH) x [0, OW) the value then
     * becomes
     *   (val + bias * input_filter_rescale^2) * input_filter_scale /
     *           (input_filter_rescale^2 * rescale^2)
     * (the bias term only for the two handled bias modes), goes through
     * \p nonline_mode and is converted by OutputGetter before being stored.
     * \p bias and \p output are indexed with the absolute channel \c oc.
     *
     * \param oh_start,ow_start top-left corner of the output tile
     * \param rescale forwarded to WinogradCoeff::A()
     */
    static void output(const output_compute_type* output_transform_buf,
                       const output_compute_type* bias, dst_type* output,
                       output_compute_type* transform_mid_buf, BiasMode bmode,
                       NonlineMode nonline_mode, size_t oh_start,
                       size_t ow_start, size_t OH, size_t OW, size_t oc_start,
                       size_t oc_end, size_t unit_idx, size_t nr_units_in_tile,
                       size_t m, size_t r,
                       const std::vector<float>& interp_points, DType dtype,
                       float input_filter_scale, float input_filter_rescale,
                       float rescale) {
        size_t alpha = m + r - 1;
        size_t OC = oc_end - oc_start;

        OutputGetter<output_compute_type, dst_type> getter(dtype);
        winograd::WinogradCoeff<output_compute_type> winograd_coeff(
                m, r, interp_points);
        //! A is channel independent: build it once for the whole unit
        auto&& coeff_a = winograd_coeff.A(rescale);

        output_compute_type* mid_buf1 = transform_mid_buf;
        output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;

        size_t OCB = OC / pack_size;
        for (size_t oc = oc_start; oc < oc_end; oc++) {
            //! block index inside the transformed buffer is relative to
            //! oc_start, the lane inside the block is not
            size_t ocb = (oc - oc_start) / pack_size;
            size_t oc_pack = oc % pack_size;
            // gather
            rep(i, alpha) rep(j, alpha) {
                mid_buf1[i * alpha + j] = output_transform_buf
                        [(i * alpha + j) * OCB * nr_units_in_tile * pack_size +
                         ocb * nr_units_in_tile * pack_size +
                         unit_idx * pack_size + oc_pack];
            }
            /* A[alpha*m] M[alpha*alpha] */
            megdnn::naive::run_matrix_mul_tpl<output_compute_type,
                                              output_compute_type, true, false>(
                    coeff_a.data(), mid_buf1, mid_buf2, m, alpha, alpha, m,
                    alpha, alpha, dtype, dtype);
            megdnn::naive::run_matrix_mul_tpl<
                    output_compute_type, output_compute_type, false, false>(
                    mid_buf2, coeff_a.data(), mid_buf1, m, m, alpha, alpha, m,
                    m, dtype, dtype);
            rep(i, m) rep(j, m) {
                auto oh = oh_start + i;
                auto ow = ow_start + j;
                //! tiles on the bottom/right border may stick out of the image
                if (oh < OH && ow < OW) {
                    float val = mid_buf1[i * m + j];
                    if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
                        val += bias[oc] * input_filter_rescale *
                               input_filter_rescale;
                    } else if (bmode == BiasMode::BIAS) {
                        val += bias[(oc / pack_size * OH * OW + oh * OW + ow) *
                                            pack_size +
                                    oc_pack] *
                               input_filter_rescale * input_filter_rescale;
                    }
                    val = val * input_filter_scale /
                          (input_filter_rescale * input_filter_rescale *
                           rescale * rescale);
                    if (nonline_mode == NonlineMode::RELU) {
                        val = val > 0 ? val : 0;
                    } else if (nonline_mode == NonlineMode::SIGMOID) {
                        val = 1.f / (expf(-val) + 1.f);
                    } else if (nonline_mode == NonlineMode::H_SWISH) {
                        val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f;
                    } else {
                        megdnn_assert(nonline_mode == NonlineMode::IDENTITY);
                    }

                    output[(oc / pack_size * OH * OW + oh * OW + ow) *
                                   pack_size +
                           oc_pack] = getter(val);
                }
            }
        }
    }

    //! channel block size; defined out of class
    static size_t pack_size;
 };

 //! Out-of-class definition of the static \c pack_size member of the
 //! StrategyHelperNchwxx partial specialization enabled for the MK8
 //! matrix-mul format. The value is whatever
 //! MatrixMulForward::pack_size(format) returns for that format.
 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type, param::MatrixMul::Format format>
 size_t StrategyHelperNchwxx<
        ctype, dst_type, input_filter_compute_type, output_compute_type, format,
        std::enable_if_t<format == param::MatrixMul::Format::MK8>>::pack_size =
        MatrixMulForward::pack_size(format);

 #define INST(_ctype, _dst_type, _input_filter_compute_type, \
             _output_compute_type)                          \
    template class StrategyHelperNchwxx<                    \
            _ctype, _dst_type, _input_filter_compute_type,  \
            _output_compute_type, param::MatrixMul::Format::MK8>;
 INST(float, float, float, float)
            _output_compute_type, layout, param::MatrixMul::Format::MK8>;
 INST(int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW)
 INST(float, float, float, float, param::ConvBias::Format::NCHW88)
 MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16,
                        param::ConvBias::Format::NCHW))
 #undef INST



 }  // namespace winograd
 }  // namespace megdnn

--- a/dnn/src/common/winograd/winograd_helper.h
+++ b/dnn/src/common/winograd/winograd_helper.h
@@ -6,7 +6,8 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #pragma once
@@ -28,8 +29,8 @@ using BiasMode = ConvBiasForward::BiasMode;
 */
 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type,
          param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT,
          typename enable = void>
          param::ConvBias::Format layout = param::ConvBias::Format::NCHW,
          param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT>
 class StrategyHelper {
 public:
    static void filter(const ctype* filter,
@@ -61,47 +62,6 @@ public:
           float rescale = 1.0f);
 };

 /**
 * \brief Strategy helper, contains some helper function for debug kernel
 * implementation
 *
 * Declares three static entry points (filter / input / output transform) for
 * channel-blocked tensors. \p format defaults to MatrixMul::Format::MK8 and
 * \p enable is an extra type parameter, defaulting to void, that partial
 * specializations can constrain.
 *
 * \warning The layout should be NCHW88
 */
 template <typename ctype, typename dst_type, typename input_filter_compute_type,
          typename output_compute_type,
          param::MatrixMul::Format format = param::MatrixMul::Format::MK8,
          typename enable = void>
 class StrategyHelperNchwxx {
 public:
    /**
     * \brief Filter transform for output channels [oc_start, oc_end) over
     * all \p IC input channels.
     *
     * \param filter source filters (r x r per channel pair)
     * \param filter_transform_buf destination of the transformed filters
     * \param transform_mid_buf scratch memory used during the transform
     * \param m,r output block size and filter size
     * \param interp_points interpolation points of the winograd coefficients
     * \param rescale scale applied when generating the coefficient matrix
     */
    static void filter(const ctype* filter,
                       input_filter_compute_type* filter_transform_buf,
                       input_filter_compute_type* transform_mid_buf, size_t OC,
                       size_t IC, size_t oc_start, size_t oc_end, size_t m,
                       size_t r, const std::vector<float>& interp_points,
                       DType dtype, float rescale = 1.0f);

    /**
     * \brief Input transform of the tile whose top-left corner is
     * (ih_start, iw_start), for all \p IC channels.
     *
     * \param IH,IW spatial size of the input; \p ih_start / \p iw_start are
     *      signed, so a tile may begin outside the image
     * \param unit_idx position of this tile among the \p nr_units_in_tile
     *      tiles stored together in \p input_transform_buf
     */
    static void input(const ctype* input,
                      input_filter_compute_type* input_transform_buf,
                      input_filter_compute_type* transform_mid_buf,
                      int ih_start, int iw_start, size_t IH, size_t IW,
                      size_t IC, size_t unit_idx, size_t nr_units_in_tile,
                      size_t m, size_t r,
                      const std::vector<float>& interp_points, DType dtype,
                      float rescale = 1.0f);

    /**
     * \brief Output transform of one unit for output channels
     * [oc_start, oc_end), including bias and non-linearity.
     *
     * \param bias bias values, interpreted according to \p bmode
     * \param oh_start,ow_start top-left corner of the output tile inside the
     *      OH x OW output
     */
    static void
    output(const output_compute_type* output_transform_buf,
           const output_compute_type* bias, dst_type* output,
           output_compute_type* transform_mid_buf, BiasMode bmode,
           NonlineMode nonline_mode, size_t oh_start, size_t ow_start,
           size_t OH, size_t OW, size_t oc_start, size_t oc_end,
           size_t unit_idx, size_t nr_units_in_tile, size_t m, size_t r,
           const std::vector<float>& interp_points, DType dtype,
           float input_filter_scale = 1.0f,    // input_scale * filter_scale
           float input_filter_rescale = 1.0f,  // input_rescale * filter_rescale
           float rescale = 1.0f);
 };

 }  // namespace winograd
 }  // namespace megdnn
   // vim: syntax=cpp.doxygen
--- a/dnn/src/fallback/conv_bias/winograd/strategy.cpp
+++ b/dnn/src/fallback/conv_bias/winograd/strategy.cpp
@@ -6,13 +6,14 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/fallback/conv_bias/winograd/strategy.h"
 #include "src/fallback/conv_bias/winograd/winograd.h"
 #include "src/common/winograd/winograd_helper.h"
 #include "src/common/utils.h"
 #include "src/common/winograd/winograd_helper.h"
 #include "src/fallback/conv_bias/winograd/winograd.h"

 namespace megdnn {
 namespace fallback {
@@ -60,7 +61,7 @@ void winograd_2x3_4x4_f::filter(const float* filter,
                                float* transform_mid_buf, size_t OC, size_t IC,
                                size_t oc_start, size_t oc_end) {
    ::megdnn::winograd::StrategyHelper<
            float, float, float, float,
            float, float, float, float, param::ConvBias::Format::NCHW,
            param::MatrixMul::Format::MK4>::filter(filter, filter_transform_buf,
                                                   transform_mid_buf, OC, IC,
                                                   oc_start, oc_end,
@@ -73,11 +74,15 @@ void winograd_2x3_4x4_f::input(const float* input, float* input_transform_buf,
                               float* transform_mid_buf, int ih_start,
                               int iw_start, size_t IH, size_t IW, size_t IC,
                               size_t unit_idx, size_t nr_units_in_tile) {
    ::megdnn::winograd::StrategyHelper<float, float, float, float,
                                       param::MatrixMul::Format::MK4>::
            input(input, input_transform_buf, transform_mid_buf, ih_start,
                  iw_start, IH, IW, IC, unit_idx, nr_units_in_tile,
                  OUTPUT_BLOCK_SIZE, KERNEL_SIZE, {0, 1, -1}, src_dtype);
    ::megdnn::winograd::StrategyHelper<
            float, float, float, float, param::ConvBias::Format::NCHW,
            param::MatrixMul::Format::MK4>::input(input, input_transform_buf,
                                                  transform_mid_buf, ih_start,
                                                  iw_start, IH, IW, IC,
                                                  unit_idx, nr_units_in_tile,
                                                  OUTPUT_BLOCK_SIZE,
                                                  KERNEL_SIZE, {0, 1, -1},
                                                  src_dtype);
 }

 void winograd_2x3_4x4_f::output(const float* output_transform_buf,
@@ -87,16 +92,19 @@ void winograd_2x3_4x4_f::output(const float* output_transform_buf,
                                size_t ow_start, size_t OH, size_t OW,
                                size_t oc_start, size_t oc_end, size_t unit_idx,
                                size_t nr_units_in_tile) {
    ::megdnn::winograd::StrategyHelper<float, float, float, float,
                                       param::MatrixMul::Format::MK4>::
            output(output_transform_buf, bias, output, transform_mid_buf, bmode,
                   nonline_mode, oh_start, ow_start, OH, OW, oc_start, oc_end,
                   unit_idx, nr_units_in_tile, OUTPUT_BLOCK_SIZE, KERNEL_SIZE,
                   {0, 1, -1}, dst_dtype);
    ::megdnn::winograd::StrategyHelper<
            float, float, float, float, param::ConvBias::Format::NCHW,
            param::MatrixMul::Format::MK4>::output(output_transform_buf, bias,
                                                   output, transform_mid_buf,
                                                   bmode, nonline_mode,
                                                   oh_start, ow_start, OH, OW,
                                                   oc_start, oc_end, unit_idx,
                                                   nr_units_in_tile,
                                                   OUTPUT_BLOCK_SIZE,
                                                   KERNEL_SIZE, {0, 1, -1},
                                                   dst_dtype);
 }



 MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_1x1_qs8)

 void winograd_2x3_1x1_qs8::filter(const int8_t* filter,
@@ -136,7 +144,6 @@ void winograd_2x3_1x1_qs8::output(const int* output_transform_buf,
            {0, 1, -1}, dst_dtype, scale_input * scale_filter, 2.0f, 1.0f);
 }


 MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_8x8_qs8)

 void winograd_2x3_8x8_qs8::filter(const int8_t* filter,
@@ -144,7 +151,7 @@ void winograd_2x3_8x8_qs8::filter(const int8_t* filter,
                                  int16_t* transform_mid_buf, size_t OC,
                                  size_t IC, size_t oc_start, size_t oc_end) {
    ::megdnn::winograd::StrategyHelper<
            int8_t, int8_t, int16_t, int,
            int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW,
            param::MatrixMul::Format::MK8>::filter(filter, filter_transform_buf,
                                                   transform_mid_buf, OC, IC,
                                                   oc_start, oc_end,
@@ -158,11 +165,15 @@ void winograd_2x3_8x8_qs8::input(const int8_t* input,
                                 int16_t* transform_mid_buf, int ih_start,
                                 int iw_start, size_t IH, size_t IW, size_t IC,
                                 size_t unit_idx, size_t nr_units_in_tile) {
    ::megdnn::winograd::StrategyHelper<int8_t, int8_t, int16_t, int,
                                       param::MatrixMul::Format::MK8>::
            input(input, input_transform_buf, transform_mid_buf, ih_start,
                  iw_start, IH, IW, IC, unit_idx, nr_units_in_tile,
                  OUTPUT_BLOCK_SIZE, KERNEL_SIZE, {0, 1, -1}, src_dtype, 1.0f);
    ::megdnn::winograd::StrategyHelper<
            int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW,
            param::MatrixMul::Format::MK8>::input(input, input_transform_buf,
                                                  transform_mid_buf, ih_start,
                                                  iw_start, IH, IW, IC,
                                                  unit_idx, nr_units_in_tile,
                                                  OUTPUT_BLOCK_SIZE,
                                                  KERNEL_SIZE, {0, 1, -1},
                                                  src_dtype, 1.0f);
 }

 void winograd_2x3_8x8_qs8::output(const int* output_transform_buf,
@@ -180,13 +191,19 @@ void winograd_2x3_8x8_qs8::output(const int* output_transform_buf,
        megdnn_assert(filter_dtype.enumv() == DTypeEnum::QuantizedS16);
        scale_filter = filter_dtype.param<dtype::QuantizedS16>().scale;
    }
    ::megdnn::winograd::StrategyHelper<int8_t, int8_t, int16_t, int,
                                       param::MatrixMul::Format::MK8>::
            output(output_transform_buf, bias, output, transform_mid_buf, bmode,
                   nonline_mode, oh_start, ow_start, OH, OW, oc_start, oc_end,
                   unit_idx, nr_units_in_tile, OUTPUT_BLOCK_SIZE, KERNEL_SIZE,
                   {0, 1, -1}, dst_dtype, scale_input * scale_filter, 2.0f,
                   1.0f);
    ::megdnn::winograd::StrategyHelper<
            int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW,
            param::MatrixMul::Format::MK8>::output(output_transform_buf, bias,
                                                   output, transform_mid_buf,
                                                   bmode, nonline_mode,
                                                   oh_start, ow_start, OH, OW,
                                                   oc_start, oc_end, unit_idx,
                                                   nr_units_in_tile,
                                                   OUTPUT_BLOCK_SIZE,
                                                   KERNEL_SIZE, {0, 1, -1},
                                                   dst_dtype,
                                                   scale_input * scale_filter,
                                                   2.0f, 1.0f);
 }

 }  // namespace winograd
--- a/dnn/src/fallback/conv_bias/winograd/strategy.h
+++ b/dnn/src/fallback/conv_bias/winograd/strategy.h
@@ -28,6 +28,7 @@ MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 1, 1,

 MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 8, 8,
                             winograd_2x3_8x8_qs8)

 }
 }  // namespace fallback
 }  // namespace megdnn
--- a/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
+++ b/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
@@ -6,7 +6,8 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/naive/winograd_filter_preprocess/opr_impl.h"
@@ -49,17 +50,16 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
    size_t m = param().output_block_size;

    bool execed = false;
 #define cb(_ctype, _dst_type, _input_filter_compute_type,                     \
           _output_compute_type, _format, rescale)                            \
    if (param().format == _format) {                                          \
        return winograd::StrategyHelper<                                      \
                _ctype, _dst_type, _input_filter_compute_type,                \
                _output_compute_type, _format>::filter(src_ptr, dst_ptr,      \
                                                       workspace_ptr, OC, IC, \
                                                       0, OC, m, FW,          \
                                                       interp_points,         \
                                                       src.layout.dtype,      \
                                                       rescale);              \

 #define cb(_ctype, _dst_type, _input_filter_compute_type,                    \
           _output_compute_type, _format, rescale)                           \
    if (param().format == _format) {                                         \
        return winograd::StrategyHelper<                                     \
                _ctype, _dst_type, _input_filter_compute_type,               \
                _output_compute_type, param::ConvBias::Format::NCHW,         \
                _format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \
                                 OC, m, FW, interp_points, src.layout.dtype, \
                                 rescale);                                   \
    }

 #define DISPATCH_FORMAT_MK4(_ctype, _dst_type, _input_filter_compute_type,  \
@@ -110,8 +110,9 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
        DISPATCH_KERNEL(dt_float16, dt_float16, dt_float16, dt_float16,      \
                        DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 2);          \
    })
    //! normal nchw mode

    if (src.layout.ndim <= 5) {
        //! dispatch_dtype with consider layout and format.
        if (FW == 3) {
            if (m == 2) {
                std::vector<float> interp_points = {0, 1, -1};
@@ -131,22 +132,20 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
                DISPATCH_DTYPE(3);
            }
        }
    }
 #undef cb
 #undef DISPATCH_FORMAT_MK4
 #undef DISPATCH_FORMAT_MK8
 #undef DISPATCH_DTYPE
 #define cb(_ctype, _dst_type, _input_filter_compute_type,                     \
           _output_compute_type, _format, rescale)                            \
    if (param().format == _format) {                                          \
        return winograd::StrategyHelperNchwxx<                                \
                _ctype, _dst_type, _input_filter_compute_type,                \
                _output_compute_type, _format>::filter(src_ptr, dst_ptr,      \
                                                       workspace_ptr, OC, IC, \
                                                       0, OC, m, FW,          \
                                                       interp_points,         \
                                                       src.layout.dtype,      \
                                                       rescale);              \
    } else {
 #define cb(_ctype, _dst_type, _input_filter_compute_type,                    \
           _output_compute_type, _format, rescale)                           \
    if (param().format == _format) {                                         \
        return winograd::StrategyHelper<                                     \
                _ctype, _dst_type, _input_filter_compute_type,               \
                _output_compute_type, param::ConvBias::Format::NCHW88,       \
                _format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \
                                 OC, m, FW, interp_points, src.layout.dtype, \
                                 rescale);                                   \
    }

 #define DISPATCH_FORMAT_MK8(_ctype, _dst_type, _input_filter_compute_type,  \
@@ -159,8 +158,6 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
        DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \
                        DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 0);     \
    }
    //! nchwxx mode
    else {
        megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7);
        if (FW == 3) {
            if (m == 2) {
@@ -171,11 +168,11 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
                DISPATCH_DTYPE(5);
            }
        }
    }
 #undef cb
 #undef DISPATCH_FORMAT_MK8
 #undef DISPATCH_KERNEL
 #undef DISPATCH_DTYPE
    }
    megdnn_assert(execed,
                  "Unsupport winograd filter preprocess. m: %zu src: %s", m,
                  src.layout.to_string().c_str());