refactor(dnn): remove ProfileCache and matmul algo in x86

GitOrigin-RevId: 55a700d747
4 years ago · e39f938662
--- a/dnn/src/x86/conv_bias/f32/algos.cpp
+++ b/dnn/src/x86/conv_bias/f32/algos.cpp
@@ -20,7 +20,6 @@
 #include "src/x86/conv_bias/postprocess_helper.h"
 #include "src/x86/convolution/convolution_direct_special_cases.h"
 #include "src/x86/handle.h"
 #include "src/x86/profile.h"
 #include "midout.h"
@@ -487,153 +486,6 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoDirectStride2::get_kimpls(
        const NCBKernSizeParam& param) const {
    GET_KERN;
 }
 /* ===================== matmul algo ===================== */
 //! Compute the workspace layout used by kimpl().
 //!
 //! The returned bundle has three parts:
 //!   part0: the zero-padded copy of one group's source image
 //!          (IC * IH2 * IW2 floats plus 4 extra floats),
 //!   part1: the unrolled (im2col) matrix
 //!          (IC * FH * FW * OH * OW floats plus 4 extra floats),
 //!   part2: whatever the delegated MatrixMul operator asks for.
 //! part0 and part1 are empty when the convolution is a 1x1, stride-1,
 //! unpadded one, because the source is then fed to matmul directly.
 WorkspaceBundle ConvBiasImpl::AlgoMatrixMul::get_bundle(
        const NCBKernSizeParam& param) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    MEGDNN_MARK_USED_VAR(N);
    MEGDNN_MARK_USED_VAR(OC);
    // Padded input extent. Named the same way as in kimpl(): IH2 is the
    // padded height and IW2 the padded width (only their product is used
    // here, so the sizes are unchanged).
    auto IH2 = IH + 2 * PH;
    auto IW2 = IW + 2 * PW;
    bool can_matrix_mul_direct =
            (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0);
    size_t part0 = 0, part1 = 0, part2 = 0;
    if (!can_matrix_mul_direct) {
        // temp space to store the padded src (with 4 extra floats)
        part0 = (IC * IH2 * IW2 + 4) * sizeof(float);
        // temp space to store the unrolled matrix (with 4 extra floats)
        part1 = (IC * FH * FW * OH * OW + 4) * sizeof(float);
    }
    {
        // workspace for the matrix mul opr: C(OC x OH*OW) = A * B
        TensorLayout A_, B_, C_;
        A_ = TensorLayout({OC, IC * FH * FW}, dtype::Float32());
        B_ = TensorLayout({IC * FH * FW, OH * OW}, dtype::Float32());
        C_ = TensorLayout({OC, OH * OW}, dtype::Float32());
        part2 = get_matmul_opr()->get_workspace_in_bytes(A_, B_, C_);
    }
    return {nullptr, {part0, part1, part2}};
 }
 //! Heuristic deciding whether the matmul algorithm should be preferred.
 //!
 //! Dilated and single-channel convolutions are never preferred; 1x1 filters
 //! and non-unit strides always are. Otherwise the problem is snapped to the
 //! nearest profiled (filter size, oc, ic) bucket and matmul is preferred when
 //! the (geometric mean) output size is below the profiled threshold.
 bool ConvBiasImpl::AlgoMatrixMul::is_preferred(
        const NCBKernSizeParam& param) const {
    auto&& fm = param.filter_meta;
    if (fm.dilation[0] != 1 || fm.dilation[1] != 1) {
        return false;
    }
    // single channel conv should never use matrix mul
    if (fm.ocpg == 1 || fm.icpg == 1)
        return false;
    // 1x1 conv should always use matrix mul
    if (fm.spatial[0] == 1 && fm.spatial[1] == 1)
        return true;
    // if stride is not 1x1, always use matrix mul
    if (fm.stride[0] != 1 || fm.stride[1] != 1)
        return true;
    int f = find_nearest_elem<int>(
            std::round(geometric_mean(fm.spatial[0], fm.spatial[1])),
            {2, 3, 4, 5, 6, 7});
    int oc = find_nearest_elem<int>(fm.ocpg, {4, 8, 16, 32, 64, 96, 128});
    int ic = find_nearest_elem<int>(fm.icpg, {4, 8, 16, 32, 64, 96, 128});
    int on = std::round(geometric_mean(param.osz[0], param.osz[1]));
    // NOTE(review): ProfileElement's constructor is declared as
    // (f, ic, oc, on_threshold) but is called here with (f, oc, ic, on), so
    // cur.ic holds the output-channel bucket and cur.oc the input-channel
    // bucket. The asserts below are self-consistent with that; confirm the
    // column order of the profile table before changing either side.
    ProfileElement cur(f, oc, ic, on);
    auto H = static_cast<HandleImpl*>(inplace_cpu_handle().get());
    const auto& cache = H->profile_cache();
    auto target = std::lower_bound(cache.begin(), cache.end(), cur);
    // lower_bound returns end() when no entry is >= cur; never dereference
    // that.
    megdnn_assert_internal(target != cache.end());
    megdnn_assert_internal(target->f == cur.f);
    megdnn_assert_internal(target->oc == cur.oc);
    megdnn_assert_internal(target->ic == cur.ic);
    return on < target->on_threshold;
 }
 //! Return the MatrixMul operator obtained from a function-local static
 //! CpuOprDelegationStorage, so every call goes through the same storage
 //! object.
 MatrixMul* ConvBiasImpl::AlgoMatrixMul::get_matmul_opr() {
    static CpuOprDelegationStorage<> opr_storage;
    MatrixMul* matmul_opr = opr_storage.get<MatrixMul>();
    return matmul_opr;
 }
 //! Kernel for one group (group index taken from ncb_index.ndrange_id[0]).
 //!
 //! For every batch element: build the right-hand matrix B (either the source
 //! itself for 1x1/stride-1/unpadded convs, or a zero-padded copy unrolled by
 //! img2col), run dst = filter * B through the delegated MatrixMul operator,
 //! then apply bias and nonlinearity in place via PostProcess.
 void ConvBiasImpl::AlgoMatrixMul::kimpl(const NCBKernParam& param,
                                         const NCBKernIndex& ncb_index) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    // padded input extent
    auto IH2 = IH + 2 * PH;
    auto IW2 = IW + 2 * PW;
    const size_t group_id = ncb_index.ndrange_id[0];
    const bool is_xcorr = !param.filter_meta.should_flip;
    const bool unit_stride = (SH == 1 && SW == 1);
    // special case: 1x1 filter, stride 1, no padding -> src already is B
    const bool src_is_matrix =
            (FH == 1 && FW == 1 && unit_stride && PH == 0 && PW == 0);

    // bundle part 0: padded src, part 1: unrolled matrix, part 2: matmul
    // workspace (see get_bundle)
    auto bundle = get_bundle(param);
    bundle.set(param.workspace_ptr);

    for (size_t n = 0; n < N; ++n) {
        float* src = const_cast<float*>(param.src<float>(n, group_id));
        float* dst = param.dst<float>(n, group_id);
        float* bias_ptr = static_cast<float*>(
                const_cast<void*>(param.bias<void>(n, group_id)));

        float* B = src;
        if (!src_is_matrix) {
            float* src2 = static_cast<float*>(bundle.get(0));
            // copy src into src2, surrounding every channel with PH zero
            // rows above/below and PW zero columns left/right
            float* padded_ptr = src2;
            const float* raw_ptr = src;
            rep(ic, IC) {
                if (PH != 0) {
                    std::memset(padded_ptr, 0, sizeof(float) * PH * IW2);
                    padded_ptr += PH * IW2;
                }
                rep(ih, IH) {
                    if (PW != 0) {
                        rep(pw, PW) { *(padded_ptr++) = 0.0f; }
                    }
                    std::memcpy(padded_ptr, raw_ptr, sizeof(float) * IW);
                    padded_ptr += IW;
                    raw_ptr += IW;
                    if (PW != 0) {
                        rep(pw, PW) { *(padded_ptr++) = 0.0f; }
                    }
                }
                if (PH != 0) {
                    std::memset(padded_ptr, 0, sizeof(float) * PH * IW2);
                    padded_ptr += PH * IW2;
                }
            }

            // unroll the padded image into the matmul right-hand side
            B = static_cast<float*>(bundle.get(1));
            if (is_xcorr) {
                if (unit_stride) {
                    img2col<true>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW);
                } else {
                    img2col_stride<true>(src2, B, OC, OH, OW, IC, IH2, IW2, FH,
                                         FW, SH, SW);
                }
            } else {
                if (unit_stride) {
                    img2col<false>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW);
                } else {
                    img2col_stride<false>(src2, B, OC, OH, OW, IC, IH2, IW2, FH,
                                          FW, SH, SW);
                }
            }
        }

        {
            // dst(OC x OH*OW) = filter(OC x IC*FH*FW) * B(IC*FH*FW x OH*OW)
            TensorND A_, B_, C_;
            A_.layout = TensorLayout({OC, IC * FH * FW}, dtype::Float32());
            A_.raw_ptr = const_cast<float*>(param.filter<float>(group_id));
            B_.layout = TensorLayout({IC * FH * FW, OH * OW}, dtype::Float32());
            B_.raw_ptr = B;
            C_.layout = TensorLayout({OC, OH * OW}, dtype::Float32());
            C_.raw_ptr = dst;
            Workspace workspace(static_cast<dt_byte*>(bundle.get(2)),
                                bundle.get_size(2));
            get_matmul_opr()->exec(A_, B_, C_, workspace);
        }

        // bias + nonlinearity, reading from and writing to dst
        PostProcess<float>::run(dst, bias_ptr, dst, param.bias_mode,
                                param.nonlineMode, param.bias_type,
                                param.dst_type, 1_z, OC, OH, OW);
    }
 }
 #if MEGDNN_X86_WITH_MKL_DNN
 static inline void mkldnn_fp32_conv_instance(
--- a/dnn/src/x86/conv_bias/f32/algos.h
+++ b/dnn/src/x86/conv_bias/f32/algos.h
@@ -123,47 +123,6 @@ public:
    MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
 };
 /* ===================== matmul algo ===================== */
 //! Float32 NCHW conv-bias algorithm reported under the name
 //! "X86_CONV_BIAS_MATMUL" and categorised as IM2COL.
 class ConvBiasImpl::AlgoMatrixMul final : public AlgoBase {
    //! matmul operator used by get_bundle() and kimpl()
    static MatrixMul* get_matmul_opr();
    //! workspace layout for the given problem size
    static WorkspaceBundle get_bundle(const NCBKernSizeParam& param);
    //! kernel dispatched once per group
    static void kimpl(const NCBKernParam& param, const NCBKernIndex&);

 public:
    bool is_reproducible() const override { return true; }
    const char* name() const override { return "X86_CONV_BIAS_MATMUL"; }

    //! Usable only for 2-d NCHW, all-Float32, undilated convolutions that
    //! run on a single thread.
    bool usable(const NCBKernSizeParam& param,
                AlgoSelectionStrategy) const override {
        const auto& meta = param.filter_meta;
        if (!(meta.format == Param::Format::NCHW && meta.spatial_ndim == 2))
            return false;
        if (!(param.src_type.enumv() == DTypeEnum::Float32 &&
              param.filter_type.enumv() == DTypeEnum::Float32 &&
              param.dst_type.enumv() == DTypeEnum::Float32))
            return false;
        if (!(meta.dilation[0] == 1 && meta.dilation[1] == 1))
            return false;
        //! The matmul opr is only used in single thread
        //! TODO: support the no pack matmul algo in fallback im2col + matmul
        return param.nr_threads == 1_z;
    }

    bool is_preferred(const NCBKernSizeParam&) const override;

    size_t get_workspace(const NCBKernSizeParam& param) const override {
        auto bundle = get_bundle(param);
        return bundle.total_size_in_bytes();
    }

    //! One kernel whose first ndrange dimension is the group count.
    SmallVector<NCBKern> dispatch_kerns(
            const NCBKernSizeParam& param) const override {
        size_t nr_group = param.filter_meta.group;
        return {{kimpl, {nr_group, 1_z, 1_z}}};
    }

    void* type() const override;

    ConvAlgoTypePack get_algo_type() const override {
        return {AlgoDataType::FLOAT32, AlgoCategory::IM2COL};
    }
 };
 #if MEGDNN_X86_WITH_MKL_DNN
 class ConvBiasImpl::AlgoMkldnnConv final : public AlgoBase {
    static void kern_mkldnn_fp32(const NCBKernParam& param,
--- a/dnn/src/x86/conv_bias/opr_impl.cpp
+++ b/dnn/src/x86/conv_bias/opr_impl.cpp
@@ -47,10 +47,6 @@ void* ConvBiasImpl::AlgoDirectStride2::type() const {
    return x86_algo_type;
 }
 //! Returns x86_algo_type, which is defined outside the lines shown here.
 //! NOTE(review): presumably a tag shared by all x86 conv-bias algos - confirm.
 void* ConvBiasImpl::AlgoMatrixMul::type() const {
    return x86_algo_type;
 }
 //! Returns x86_algo_type, which is defined outside the lines shown here.
 //! NOTE(review): presumably a tag shared by all x86 conv-bias algos - confirm.
 void* ConvBiasImpl::AlgoDirectAvx2Stride1Int8::type() const {
    return x86_algo_type;
 }
@@ -82,7 +78,6 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
    AlgoAVX2DirectConvStride2 avx2_stride2_direct;
    AlgoChanWiseAvx2Stride1Qint8 avx2_stride1_chanwsie_qint8;
    AlgoChanWiseAvx2Stride2Qint8 avx2_stride2_chanwsie_qint8;
    AlgoMatrixMul matmul;
 #if MEGDNN_X86_WITH_MKL_DNN
    AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8;
    //! Because the mkldnnconv need handle
@@ -107,7 +102,6 @@ public:
        all_algos.emplace_back(&avx2_stride2_chanwsie_qint8);
        all_algos.emplace_back(&avx2_stride1_direct_int8);
        all_algos.emplace_back(&avx2_stride2_direct);
        all_algos.emplace_back(&matmul);
        static CpuOprDelegationStorage<> storage;
        auto matmul_opr = storage.get<MatrixMul>();
--- a/dnn/src/x86/conv_bias/opr_impl.h
+++ b/dnn/src/x86/conv_bias/opr_impl.h
@@ -31,7 +31,6 @@ public:
    class AlgoDirectStride2;
    class AlgoFP32WinogradF63_8x8;
    class AlgoFP32WinogradF23_8x8;
    class AlgoMatrixMul;
    class AlgoDirectAvx2Stride1Int8;
    class AlgoAVX2DirectConvStride2;
    class AlgoChanWiseAvx2Stride1Qint8;
--- a/dnn/src/x86/handle.h
+++ b/dnn/src/x86/handle.h
@@ -11,8 +11,6 @@
 #pragma once
 #include "src/fallback/handle.h"
 #include "src/x86/profile.h"
 #if MEGDNN_X86_WITH_MKL_DNN
 #include <mkldnn.hpp>
 #endif
@@ -22,8 +20,6 @@ namespace x86 {
 class HandleImpl : public fallback::HandleImpl {
 public:
    const ProfileCache& profile_cache() { return m_profile_cache; }
    HandleImpl(megcoreComputingHandle_t computing_handle,
               HandleType type = HandleType::X86);
@@ -37,7 +33,6 @@ public:
 #endif
 private:
    ProfileCache m_profile_cache = get_profile_cache();
 #if MEGDNN_X86_WITH_MKL_DNN
    dnnl::engine m_mkldnn_engine;
    dnnl::stream m_mkldnn_stream;
--- a/dnn/src/x86/profile.cpp
+++ b/dnn/src/x86/profile.cpp
@@ -1,324 +0,0 @@
 /**
 * \file dnn/src/x86/profile.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
 #include "src/x86/profile.h"
 namespace megdnn {
 namespace x86 {
 // Build the hard-coded profile table consulted by
 // ConvBiasImpl::AlgoMatrixMul::is_preferred().
 //
 // The table holds 294 entries: 6 values of the first field (2..7) times
 // 7 x 7 combinations of the second and third fields, each taken from
 // {4, 8, 16, 32, 64, 96, 128}. Entries are listed in ascending order of
 // (first, second, third) field, which is the order ProfileElement::operator<
 // compares and therefore what std::lower_bound in is_preferred() needs.
 //
 // The fourth field is on_threshold; is_preferred() returns
 // `on < on_threshold`, so the value 1000000000 acts as "threshold never
 // reached" for any realistic output size.
 //
 // NOTE(review): ProfileElement's constructor names its 2nd/3rd arguments
 // (ic, oc), while is_preferred() passes (oc, ic) in those positions. Confirm
 // which channel count each column was profiled for before editing the data.
 ProfileCache get_profile_cache()
 {
    ProfileCache vec;
    // no-op on a freshly constructed vector
    vec.clear();
    // one allocation for all 294 entries
    vec.reserve(294);
    // ---- first field == 2 ----
    vec.push_back(ProfileElement(2, 4, 4, 49));
    vec.push_back(ProfileElement(2, 4, 8, 25));
    vec.push_back(ProfileElement(2, 4, 16, 19));
    vec.push_back(ProfileElement(2, 4, 32, 14));
    vec.push_back(ProfileElement(2, 4, 64, 13));
    vec.push_back(ProfileElement(2, 4, 96, 15));
    vec.push_back(ProfileElement(2, 4, 128, 15));
    vec.push_back(ProfileElement(2, 8, 4, 241));
    vec.push_back(ProfileElement(2, 8, 8, 121));
    vec.push_back(ProfileElement(2, 8, 16, 57));
    vec.push_back(ProfileElement(2, 8, 32, 29));
    vec.push_back(ProfileElement(2, 8, 64, 17));
    vec.push_back(ProfileElement(2, 8, 96, 39));
    vec.push_back(ProfileElement(2, 8, 128, 29));
    vec.push_back(ProfileElement(2, 16, 4, 1000000000));
    vec.push_back(ProfileElement(2, 16, 8, 273));
    vec.push_back(ProfileElement(2, 16, 16, 177));
    vec.push_back(ProfileElement(2, 16, 32, 137));
    vec.push_back(ProfileElement(2, 16, 64, 1000000000));
    vec.push_back(ProfileElement(2, 16, 96, 1000000000));
    vec.push_back(ProfileElement(2, 16, 128, 1000000000));
    vec.push_back(ProfileElement(2, 32, 4, 1000000000));
    vec.push_back(ProfileElement(2, 32, 8, 1000000000));
    vec.push_back(ProfileElement(2, 32, 16, 1000000000));
    vec.push_back(ProfileElement(2, 32, 32, 1000000000));
    vec.push_back(ProfileElement(2, 32, 64, 1000000000));
    vec.push_back(ProfileElement(2, 32, 96, 1000000000));
    vec.push_back(ProfileElement(2, 32, 128, 1000000000));
    vec.push_back(ProfileElement(2, 64, 4, 1000000000));
    vec.push_back(ProfileElement(2, 64, 8, 1000000000));
    vec.push_back(ProfileElement(2, 64, 16, 1000000000));
    vec.push_back(ProfileElement(2, 64, 32, 1000000000));
    vec.push_back(ProfileElement(2, 64, 64, 1000000000));
    vec.push_back(ProfileElement(2, 64, 96, 1000000000));
    vec.push_back(ProfileElement(2, 64, 128, 1000000000));
    vec.push_back(ProfileElement(2, 96, 4, 1000000000));
    vec.push_back(ProfileElement(2, 96, 8, 1000000000));
    vec.push_back(ProfileElement(2, 96, 16, 1000000000));
    vec.push_back(ProfileElement(2, 96, 32, 1000000000));
    vec.push_back(ProfileElement(2, 96, 64, 1000000000));
    vec.push_back(ProfileElement(2, 96, 96, 1000000000));
    vec.push_back(ProfileElement(2, 96, 128, 1000000000));
    vec.push_back(ProfileElement(2, 128, 4, 1000000000));
    vec.push_back(ProfileElement(2, 128, 8, 1000000000));
    vec.push_back(ProfileElement(2, 128, 16, 1000000000));
    vec.push_back(ProfileElement(2, 128, 32, 1000000000));
    vec.push_back(ProfileElement(2, 128, 64, 1000000000));
    vec.push_back(ProfileElement(2, 128, 96, 1000000000));
    vec.push_back(ProfileElement(2, 128, 128, 1000000000));
    // ---- first field == 3 ----
    vec.push_back(ProfileElement(3, 4, 4, 10));
    vec.push_back(ProfileElement(3, 4, 8, 5));
    vec.push_back(ProfileElement(3, 4, 16, 7));
    vec.push_back(ProfileElement(3, 4, 32, 7));
    vec.push_back(ProfileElement(3, 4, 64, 6));
    vec.push_back(ProfileElement(3, 4, 96, 5));
    vec.push_back(ProfileElement(3, 4, 128, 5));
    vec.push_back(ProfileElement(3, 8, 4, 14));
    vec.push_back(ProfileElement(3, 8, 8, 13));
    vec.push_back(ProfileElement(3, 8, 16, 13));
    vec.push_back(ProfileElement(3, 8, 32, 13));
    vec.push_back(ProfileElement(3, 8, 64, 11));
    vec.push_back(ProfileElement(3, 8, 96, 11));
    vec.push_back(ProfileElement(3, 8, 128, 12));
    vec.push_back(ProfileElement(3, 16, 4, 37));
    vec.push_back(ProfileElement(3, 16, 8, 29));
    vec.push_back(ProfileElement(3, 16, 16, 21));
    vec.push_back(ProfileElement(3, 16, 32, 19));
    vec.push_back(ProfileElement(3, 16, 64, 14));
    vec.push_back(ProfileElement(3, 16, 96, 13));
    vec.push_back(ProfileElement(3, 16, 128, 13));
    vec.push_back(ProfileElement(3, 32, 4, 69));
    vec.push_back(ProfileElement(3, 32, 8, 105));
    vec.push_back(ProfileElement(3, 32, 16, 105));
    vec.push_back(ProfileElement(3, 32, 32, 49));
    vec.push_back(ProfileElement(3, 32, 64, 29));
    vec.push_back(ProfileElement(3, 32, 96, 27));
    vec.push_back(ProfileElement(3, 32, 128, 39));
    vec.push_back(ProfileElement(3, 64, 4, 193));
    vec.push_back(ProfileElement(3, 64, 8, 161));
    vec.push_back(ProfileElement(3, 64, 16, 137));
    vec.push_back(ProfileElement(3, 64, 32, 113));
    vec.push_back(ProfileElement(3, 64, 64, 1000000000));
    vec.push_back(ProfileElement(3, 64, 96, 1000000000));
    vec.push_back(ProfileElement(3, 64, 128, 1000000000));
    vec.push_back(ProfileElement(3, 96, 4, 1000000000));
    vec.push_back(ProfileElement(3, 96, 8, 305));
    vec.push_back(ProfileElement(3, 96, 16, 1000000000));
    vec.push_back(ProfileElement(3, 96, 32, 1000000000));
    vec.push_back(ProfileElement(3, 96, 64, 1000000000));
    vec.push_back(ProfileElement(3, 96, 96, 1000000000));
    vec.push_back(ProfileElement(3, 96, 128, 1000000000));
    vec.push_back(ProfileElement(3, 128, 4, 1000000000));
    vec.push_back(ProfileElement(3, 128, 8, 1000000000));
    vec.push_back(ProfileElement(3, 128, 16, 1000000000));
    vec.push_back(ProfileElement(3, 128, 32, 1000000000));
    vec.push_back(ProfileElement(3, 128, 64, 1000000000));
    vec.push_back(ProfileElement(3, 128, 96, 1000000000));
    vec.push_back(ProfileElement(3, 128, 128, 1000000000));
    // ---- first field == 4 ----
    vec.push_back(ProfileElement(4, 4, 4, 7));
    vec.push_back(ProfileElement(4, 4, 8, 7));
    vec.push_back(ProfileElement(4, 4, 16, 5));
    vec.push_back(ProfileElement(4, 4, 32, 6));
    vec.push_back(ProfileElement(4, 4, 64, 5));
    vec.push_back(ProfileElement(4, 4, 96, 5));
    vec.push_back(ProfileElement(4, 4, 128, 5));
    vec.push_back(ProfileElement(4, 8, 4, 14));
    vec.push_back(ProfileElement(4, 8, 8, 12));
    vec.push_back(ProfileElement(4, 8, 16, 5));
    vec.push_back(ProfileElement(4, 8, 32, 6));
    vec.push_back(ProfileElement(4, 8, 64, 6));
    vec.push_back(ProfileElement(4, 8, 96, 6));
    vec.push_back(ProfileElement(4, 8, 128, 5));
    vec.push_back(ProfileElement(4, 16, 4, 14));
    vec.push_back(ProfileElement(4, 16, 8, 14));
    vec.push_back(ProfileElement(4, 16, 16, 13));
    vec.push_back(ProfileElement(4, 16, 32, 13));
    vec.push_back(ProfileElement(4, 16, 64, 13));
    vec.push_back(ProfileElement(4, 16, 96, 13));
    vec.push_back(ProfileElement(4, 16, 128, 13));
    vec.push_back(ProfileElement(4, 32, 4, 37));
    vec.push_back(ProfileElement(4, 32, 8, 31));
    vec.push_back(ProfileElement(4, 32, 16, 29));
    vec.push_back(ProfileElement(4, 32, 32, 21));
    vec.push_back(ProfileElement(4, 32, 64, 21));
    vec.push_back(ProfileElement(4, 32, 96, 29));
    vec.push_back(ProfileElement(4, 32, 128, 21));
    vec.push_back(ProfileElement(4, 64, 4, 137));
    vec.push_back(ProfileElement(4, 64, 8, 113));
    vec.push_back(ProfileElement(4, 64, 16, 89));
    vec.push_back(ProfileElement(4, 64, 32, 69));
    vec.push_back(ProfileElement(4, 64, 64, 45));
    vec.push_back(ProfileElement(4, 64, 96, 37));
    vec.push_back(ProfileElement(4, 64, 128, 35));
    vec.push_back(ProfileElement(4, 96, 4, 137));
    vec.push_back(ProfileElement(4, 96, 8, 113));
    vec.push_back(ProfileElement(4, 96, 16, 105));
    vec.push_back(ProfileElement(4, 96, 32, 77));
    vec.push_back(ProfileElement(4, 96, 64, 53));
    vec.push_back(ProfileElement(4, 96, 96, 45));
    vec.push_back(ProfileElement(4, 96, 128, 39));
    vec.push_back(ProfileElement(4, 128, 4, 137));
    vec.push_back(ProfileElement(4, 128, 8, 121));
    vec.push_back(ProfileElement(4, 128, 16, 153));
    vec.push_back(ProfileElement(4, 128, 32, 97));
    vec.push_back(ProfileElement(4, 128, 64, 1000000000));
    vec.push_back(ProfileElement(4, 128, 96, 1000000000));
    vec.push_back(ProfileElement(4, 128, 128, 1000000000));
    // ---- first field == 5 ----
    vec.push_back(ProfileElement(5, 4, 4, 8));
    vec.push_back(ProfileElement(5, 4, 8, 9));
    vec.push_back(ProfileElement(5, 4, 16, 5));
    vec.push_back(ProfileElement(5, 4, 32, 5));
    vec.push_back(ProfileElement(5, 4, 64, 5));
    vec.push_back(ProfileElement(5, 4, 96, 5));
    vec.push_back(ProfileElement(5, 4, 128, 5));
    vec.push_back(ProfileElement(5, 8, 4, 7));
    vec.push_back(ProfileElement(5, 8, 8, 6));
    vec.push_back(ProfileElement(5, 8, 16, 5));
    vec.push_back(ProfileElement(5, 8, 32, 5));
    vec.push_back(ProfileElement(5, 8, 64, 5));
    vec.push_back(ProfileElement(5, 8, 96, 5));
    vec.push_back(ProfileElement(5, 8, 128, 5));
    vec.push_back(ProfileElement(5, 16, 4, 21));
    vec.push_back(ProfileElement(5, 16, 8, 12));
    vec.push_back(ProfileElement(5, 16, 16, 12));
    vec.push_back(ProfileElement(5, 16, 32, 11));
    vec.push_back(ProfileElement(5, 16, 64, 11));
    vec.push_back(ProfileElement(5, 16, 96, 11));
    vec.push_back(ProfileElement(5, 16, 128, 11));
    vec.push_back(ProfileElement(5, 32, 4, 23));
    vec.push_back(ProfileElement(5, 32, 8, 14));
    vec.push_back(ProfileElement(5, 32, 16, 14));
    vec.push_back(ProfileElement(5, 32, 32, 13));
    vec.push_back(ProfileElement(5, 32, 64, 13));
    vec.push_back(ProfileElement(5, 32, 96, 13));
    vec.push_back(ProfileElement(5, 32, 128, 13));
    vec.push_back(ProfileElement(5, 64, 4, 77));
    vec.push_back(ProfileElement(5, 64, 8, 39));
    vec.push_back(ProfileElement(5, 64, 16, 37));
    vec.push_back(ProfileElement(5, 64, 32, 29));
    vec.push_back(ProfileElement(5, 64, 64, 29));
    vec.push_back(ProfileElement(5, 64, 96, 21));
    vec.push_back(ProfileElement(5, 64, 128, 21));
    vec.push_back(ProfileElement(5, 96, 4, 113));
    vec.push_back(ProfileElement(5, 96, 8, 77));
    vec.push_back(ProfileElement(5, 96, 16, 61));
    vec.push_back(ProfileElement(5, 96, 32, 39));
    vec.push_back(ProfileElement(5, 96, 64, 37));
    vec.push_back(ProfileElement(5, 96, 96, 31));
    vec.push_back(ProfileElement(5, 96, 128, 29));
    vec.push_back(ProfileElement(5, 128, 4, 113));
    vec.push_back(ProfileElement(5, 128, 8, 97));
    vec.push_back(ProfileElement(5, 128, 16, 69));
    vec.push_back(ProfileElement(5, 128, 32, 53));
    vec.push_back(ProfileElement(5, 128, 64, 39));
    vec.push_back(ProfileElement(5, 128, 96, 31));
    vec.push_back(ProfileElement(5, 128, 128, 31));
    // ---- first field == 6 ----
    vec.push_back(ProfileElement(6, 4, 4, 7));
    vec.push_back(ProfileElement(6, 4, 8, 3));
    vec.push_back(ProfileElement(6, 4, 16, 5));
    vec.push_back(ProfileElement(6, 4, 32, 4));
    vec.push_back(ProfileElement(6, 4, 64, 5));
    vec.push_back(ProfileElement(6, 4, 96, 4));
    vec.push_back(ProfileElement(6, 4, 128, 4));
    vec.push_back(ProfileElement(6, 8, 4, 11));
    vec.push_back(ProfileElement(6, 8, 8, 5));
    vec.push_back(ProfileElement(6, 8, 16, 5));
    vec.push_back(ProfileElement(6, 8, 32, 5));
    vec.push_back(ProfileElement(6, 8, 64, 5));
    vec.push_back(ProfileElement(6, 8, 96, 5));
    vec.push_back(ProfileElement(6, 8, 128, 5));
    vec.push_back(ProfileElement(6, 16, 4, 13));
    vec.push_back(ProfileElement(6, 16, 8, 11));
    vec.push_back(ProfileElement(6, 16, 16, 11));
    vec.push_back(ProfileElement(6, 16, 32, 5));
    vec.push_back(ProfileElement(6, 16, 64, 5));
    vec.push_back(ProfileElement(6, 16, 96, 5));
    vec.push_back(ProfileElement(6, 16, 128, 11));
    vec.push_back(ProfileElement(6, 32, 4, 21));
    vec.push_back(ProfileElement(6, 32, 8, 14));
    vec.push_back(ProfileElement(6, 32, 16, 13));
    vec.push_back(ProfileElement(6, 32, 32, 13));
    vec.push_back(ProfileElement(6, 32, 64, 13));
    vec.push_back(ProfileElement(6, 32, 96, 13));
    vec.push_back(ProfileElement(6, 32, 128, 13));
    vec.push_back(ProfileElement(6, 64, 4, 39));
    vec.push_back(ProfileElement(6, 64, 8, 29));
    vec.push_back(ProfileElement(6, 64, 16, 29));
    vec.push_back(ProfileElement(6, 64, 32, 21));
    vec.push_back(ProfileElement(6, 64, 64, 21));
    vec.push_back(ProfileElement(6, 64, 96, 21));
    vec.push_back(ProfileElement(6, 64, 128, 21));
    vec.push_back(ProfileElement(6, 96, 4, 97));
    vec.push_back(ProfileElement(6, 96, 8, 61));
    vec.push_back(ProfileElement(6, 96, 16, 39));
    vec.push_back(ProfileElement(6, 96, 32, 37));
    vec.push_back(ProfileElement(6, 96, 64, 29));
    vec.push_back(ProfileElement(6, 96, 96, 29));
    vec.push_back(ProfileElement(6, 96, 128, 21));
    vec.push_back(ProfileElement(6, 128, 4, 77));
    vec.push_back(ProfileElement(6, 128, 8, 61));
    vec.push_back(ProfileElement(6, 128, 16, 39));
    vec.push_back(ProfileElement(6, 128, 32, 37));
    vec.push_back(ProfileElement(6, 128, 64, 29));
    vec.push_back(ProfileElement(6, 128, 96, 29));
    vec.push_back(ProfileElement(6, 128, 128, 23));
    // ---- first field == 7 ----
    vec.push_back(ProfileElement(7, 4, 4, 5));
    vec.push_back(ProfileElement(7, 4, 8, 4));
    vec.push_back(ProfileElement(7, 4, 16, 4));
    vec.push_back(ProfileElement(7, 4, 32, 4));
    vec.push_back(ProfileElement(7, 4, 64, 4));
    vec.push_back(ProfileElement(7, 4, 96, 4));
    vec.push_back(ProfileElement(7, 4, 128, 3));
    vec.push_back(ProfileElement(7, 8, 4, 5));
    vec.push_back(ProfileElement(7, 8, 8, 5));
    vec.push_back(ProfileElement(7, 8, 16, 5));
    vec.push_back(ProfileElement(7, 8, 32, 5));
    vec.push_back(ProfileElement(7, 8, 64, 5));
    vec.push_back(ProfileElement(7, 8, 96, 5));
    vec.push_back(ProfileElement(7, 8, 128, 5));
    vec.push_back(ProfileElement(7, 16, 4, 13));
    vec.push_back(ProfileElement(7, 16, 8, 11));
    vec.push_back(ProfileElement(7, 16, 16, 5));
    vec.push_back(ProfileElement(7, 16, 32, 5));
    vec.push_back(ProfileElement(7, 16, 64, 5));
    vec.push_back(ProfileElement(7, 16, 96, 5));
    vec.push_back(ProfileElement(7, 16, 128, 5));
    vec.push_back(ProfileElement(7, 32, 4, 21));
    vec.push_back(ProfileElement(7, 32, 8, 13));
    vec.push_back(ProfileElement(7, 32, 16, 13));
    vec.push_back(ProfileElement(7, 32, 32, 13));
    vec.push_back(ProfileElement(7, 32, 64, 13));
    vec.push_back(ProfileElement(7, 32, 96, 13));
    vec.push_back(ProfileElement(7, 32, 128, 12));
    vec.push_back(ProfileElement(7, 64, 4, 37));
    vec.push_back(ProfileElement(7, 64, 8, 21));
    vec.push_back(ProfileElement(7, 64, 16, 14));
    vec.push_back(ProfileElement(7, 64, 32, 14));
    vec.push_back(ProfileElement(7, 64, 64, 14));
    vec.push_back(ProfileElement(7, 64, 96, 13));
    vec.push_back(ProfileElement(7, 64, 128, 14));
    vec.push_back(ProfileElement(7, 96, 4, 61));
    vec.push_back(ProfileElement(7, 96, 8, 39));
    vec.push_back(ProfileElement(7, 96, 16, 37));
    vec.push_back(ProfileElement(7, 96, 32, 31));
    vec.push_back(ProfileElement(7, 96, 64, 21));
    vec.push_back(ProfileElement(7, 96, 96, 21));
    vec.push_back(ProfileElement(7, 96, 128, 21));
    vec.push_back(ProfileElement(7, 128, 4, 61));
    vec.push_back(ProfileElement(7, 128, 8, 31));
    vec.push_back(ProfileElement(7, 128, 16, 37));
    vec.push_back(ProfileElement(7, 128, 32, 11));
    vec.push_back(ProfileElement(7, 128, 64, 13));
    vec.push_back(ProfileElement(7, 128, 96, 23));
    vec.push_back(ProfileElement(7, 128, 128, 21));
    return vec;
 }
 } // namespace x86
 } // namespace megdnn
 // vim: syntax=cpp.doxygen
--- a/dnn/src/x86/profile.h
+++ b/dnn/src/x86/profile.h
@@ -1,45 +0,0 @@
 /**
 * \file dnn/src/x86/profile.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
 #pragma once
 #include <vector>
 namespace megdnn {
 namespace x86 {
 // One row of the profile table: a (f, ic, oc) key plus the output-size
 // threshold profiled for it.
 struct ProfileElement {
    // when output_size > on_threshold, DIRECT is faster,
    // otherwise MATRIX_MUL is faster
    int f, ic, oc, on_threshold;

    ProfileElement(int f, int ic, int oc, int on_threshold)
            : f(f), ic(ic), oc(oc), on_threshold(on_threshold) {}

    // Lexicographic order on (f, ic, oc); on_threshold does not take part,
    // so two elements with the same key compare equivalent.
    bool operator<(const ProfileElement& rhs) const {
        if (f != rhs.f)
            return f < rhs.f;
        if (ic != rhs.ic)
            return ic < rhs.ic;
        return oc < rhs.oc;
    }
 };
 using ProfileCache = std::vector<ProfileElement>;
 ProfileCache get_profile_cache();
 } // namespace x86
 } // namespace megdnn
 // vim: syntax=cpp.doxygen
--- a/dnn/src/x86/separable_conv/opr_impl.cpp
+++ b/dnn/src/x86/separable_conv/opr_impl.cpp
@@ -63,7 +63,6 @@
 #include "./sep_conv_filter.h"
 #include "src/common/utils.h"
 #include "src/x86/utils.h"
 #include "src/x86/profile.h"
 #include "src/x86/handle.h"
 #include <cstring>
--- a/dnn/src/x86/separable_filter/opr_impl.cpp
+++ b/dnn/src/x86/separable_filter/opr_impl.cpp
@@ -14,7 +14,6 @@
 #include "src/common/cv/helper.h"
 #include "src/common/utils.h"
 #include "src/x86/utils.h"
 #include "src/x86/profile.h"
 #include "src/x86/handle.h"
 #include <cstring>
--- a/dnn/test/x86/conv_bias.cpp
+++ b/dnn/test/x86/conv_bias.cpp
@@ -1599,73 +1599,6 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) {
 #undef cb
 }
 //! Runs float32 conv-bias cases (stride 1, several filter sizes, paddings,
 //! channel counts and nonlinearities) through the checker with the
 //! "X86_CONV_BIAS_MATMUL" algo checker installed.
 TEST_F(X86, CONV_BIAS_MATMUL) {
    using namespace conv_bias;
    std::vector<TestArg> args;

    //! Queue four cases for one configuration: dense without bias, dense
    //! with per-channel bias, dense with full bias, and a 2-group conv with
    //! full bias.
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        //! the padded input must be at least as large as the filter
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;

        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;

        //! output spatial size for stride 1
        const size_t oh = (h + param.pad_h * 2 - kernel) + 1;
        const size_t ow = (w + param.pad_w * 2 - kernel) + 1;

        param.sparse = param::ConvBias::Sparse::DENSE;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{2, oc, oh, ow});

        //! group
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
                          TensorShape{2, oc, ic, kernel, kernel},
                          TensorShape{2, 2 * oc, oh, ow});
    };

    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t ic : {1, 2, 3, 4}) {
            for (size_t oc : {1, 2, 3, 4}) {
                for (size_t p : {0, 2}) {
                    for (size_t size : {20, 21, 22, 23, 24}) {
                        for (NonlineMode nonline_mode :
                             {NonlineMode::RELU, NonlineMode::SIGMOID,
                              NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
                    }
                }
            }
        }
    }

    Checker<ConvBias> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_MATMUL"));
    checker.set_epsilon(1);

    //! integer-valued inputs in [-50, 50] for src, filter and bias
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);

    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
 }
 #if MEGDNN_WITH_BENCHMARK
 #if MEGDNN_X86_WITH_MKL_DNN
 static void x86_benchmark_fp32_mkldnn(Handle* handle) {
--- a/dnn/test/x86/convolution.cpp
+++ b/dnn/test/x86/convolution.cpp
@@ -182,49 +182,6 @@ TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE2) {
    }
 }
 //! Runs float32 stride-1 convolutions on square inputs through the checker
 //! with the "CONVOLUTION_DEFAULT_X86_CONV_BIAS_MATMUL" algo checker
 //! installed.
 TEST_F(X86, DEFAULT_CONV_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args;

    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t ic : {1, 2, 3, 4}) {
            for (size_t oc : {1, 2, 3, 4}) {
                for (size_t p : {0, 2}) {
                    for (size_t size : {20, 21, 22, 23, 24}) {
                        //! input is square, so one check covers both
                        //! width and height: the padded input must be at
                        //! least as large as the filter
                        if (size + 2 * p < kernel)
                            continue;
                        param::Convolution param;
                        param.stride_h = 1;
                        param.stride_w = 1;
                        param.pad_h = p;
                        param.pad_w = p;
                        //! no bias
                        args.emplace_back(param, TensorShape{1, ic, size, size},
                                          TensorShape{oc, ic, kernel, kernel});
                    }
                }
            }
        }
    }

    Checker<ConvolutionForward> checker(handle());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            "CONVOLUTION_DEFAULT_X86_CONV_BIAS_MATMUL"));

    //! integer-valued inputs in [-50, 50]
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);

    for (auto&& arg : args) {
        checker.set_param(arg.param).exec({arg.src, arg.filter, {}});
    }
 }
 #if MEGDNN_X86_WITH_MKL_DNN
 TEST_F(X86, CONVOLUTION_FORWARD_INT8) {
    Checker<ConvolutionForward> checker(handle());