GitOrigin-RevId: a12a7d399a
tags/v1.3.0
@@ -0,0 +1,172 @@ | |||
/** | |||
* \file dnn/src/cuda/convolution/forward/algos.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. | |||
*/ | |||
#include "src/cuda/convolution/forward/algos.h" | |||
#include "src/cuda/conv_bias/opr_impl.h" | |||
#include "src/cuda/conv_bias/algo.h" | |||
#include "src/common/algo_base.h" | |||
#include "src/common/algo_chooser.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
namespace { | |||
std::pair<TensorLayoutArray, ConvBiasForward::Param> sub_opr_config(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& dst, const ConvolutionForwardImpl* opr) {
    //! Derive the bias dtype the ConvBias sub-opr expects for the given
    //! src/filter dtypes; quantized inputs get a QuantizedS32 bias whose
    //! scale is the product of the src and filter scales.
    DType bias_type;
    switch (src.dtype.enumv()) {
        case DTypeEnum::QuantizedS8:
            bias_type = dtype::QuantizedS32(
                    src.dtype.param<dtype::QuantizedS8>().scale *
                    filter.dtype.param<dtype::QuantizedS8>().scale);
            break;
        case DTypeEnum::Quantized8Asymm:
            bias_type = dtype::QuantizedS32(
                    src.dtype.param<dtype::Quantized8Asymm>().scale *
                    filter.dtype.param<dtype::Quantized8Asymm>().scale);
            break;
        case DTypeEnum::Uint8:
        case DTypeEnum::Int8:
            bias_type = dtype::Int32{};
            break;
        case DTypeEnum::Quantized4Asymm:
            bias_type = dtype::QuantizedS32(
                    src.dtype.param<dtype::Quantized4Asymm>().scale *
                    filter.dtype.param<dtype::Quantized4Asymm>().scale);
            break;
        default:
            //! any remaining dtype must be floating point
            megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT);
            bias_type = src.dtype;
            break;
    }
    //! ConvBias param mirrors the convolution param, with IDENTITY
    //! nonlinearity since a plain convolution applies no activation.
    auto conv_param = opr->param();
    ConvBiasForward::Param convbias_param = {
            param::ConvBias::NonlineMode::IDENTITY,
            conv_param.mode,
            conv_param.sparse,
            conv_param.format,
            conv_param.pad_h,
            conv_param.pad_w,
            conv_param.stride_h,
            conv_param.stride_w,
            conv_param.dilate_h,
            conv_param.dilate_w,
            conv_param.compute_mode};
    //! layouts: [0] = empty bias layout, [1] = empty z layout
    TensorLayoutArray sub_layouts;
    sub_layouts.push_back(TensorLayout({}, bias_type));
    sub_layouts.push_back(TensorLayout({}, dst.dtype));
    return {sub_layouts, convbias_param};
}
} // namespace | |||
//! Register every algorithm and index each one by its descriptor so it can
//! later be found via all_algos_map().
ConvolutionForwardImpl::AlgoPack::AlgoPack() {
    all_algos.push_back(&algo_default);
    for (size_t i = 0; i < all_algos.size(); ++i) {
        AlgoBase* cur = all_algos[i];
        m_all_algos_map.emplace(cur->info().desc, cur);
    }
}
//! Singleton algo pack shared by all ConvolutionForwardImpl instances.
ConvolutionForwardImpl::AlgoPack ConvolutionForwardImpl::sm_algo_pack;
//! Expands to the get_algorithm_from_desc implementation (helper macro,
//! presumably from src/common/algo_base.h which is included above).
MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionForwardImpl)
//! SizeArgs stores non-owning pointers to the caller's layouts: \p src,
//! \p filter and \p dst must outlive this object.
ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvolutionForwardImpl* o,
                                                     const TensorLayout& src,
                                                     const TensorLayout& filter,
                                                     const TensorLayout& dst)
        : opr{o}, layout_src{&src}, layout_filter{&filter}, layout_dst{&dst} {}
//! ExecArgs extends SizeArgs with the concrete tensors and workspace; the
//! tensor handles are copied by value, so they stay valid for the exec call.
ConvolutionForwardImpl::AlgoBase::ExecArgs::ExecArgs(
        ConvolutionForwardImpl* opr, _megdnn_tensor_in src,
        _megdnn_tensor_in filter, _megdnn_tensor_out dst,
        _megdnn_workspace workspace)
        : SizeArgs(opr, src.layout, filter.layout, dst.layout),
          tensor_src{src},
          tensor_filter{filter},
          tensor_dst{dst},
          workspace{workspace} {}
//! Render the src/filter/dst layouts as a single string, intended for
//! diagnostics (e.g. error messages).
std::string ConvolutionForwardImpl::AlgoBase::SizeArgs::to_string() const {
    return megdnn_mangle(ssprintf("src=%s, filter=%s, dst=%s",
                                  layout_src->to_string().c_str(),
                                  layout_filter->to_string().c_str(),
                                  layout_dst->to_string().c_str()));
}
/* ===================== default algo ===================== */
//! Report the single sub-operator this algo dispatches to: a
//! CONVBIAS_FORWARD problem equivalent to this convolution.
std::vector<Algorithm::SearchItem>
ConvolutionForwardImpl::AlgoDefault::get_subopr_list(
        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
    // derive the ConvBias param and the (empty) bias/z layouts
    auto&& config =
            sub_opr_config(layouts[0], layouts[1], layouts[2],
                           static_cast<const ConvolutionForwardImpl*>(opr));
    // ConvBias layout order: {src, filter, bias, z, dst}; bias and z are
    // empty layouts since a plain convolution has neither input
    TensorLayoutArray conv_bias_layouts = {layouts[0], layouts[1],
                                           config.first[0], config.first[1],
                                           layouts[2]};
    std::string param_str;
    Algorithm::serialize_write_pod(config.second, param_str);
    return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str,
             conv_bias_layouts}};
}
//! Available iff the underlying ConvBiasForward can supply an algorithm for
//! the equivalent conv-bias problem (with empty bias/z layouts).
bool ConvolutionForwardImpl::AlgoDefault::is_available(
        const SizeArgs& args) const {
    auto conv_bias_opr =
            args.opr->handle()->create_operator<ConvBiasForward>();
    auto&& config = sub_opr_config(
            *args.layout_src, *args.layout_filter, *args.layout_dst,
            args.opr);
    conv_bias_opr->param() = config.second;
    // get_algorithm's result converts to bool: truthy means an algo exists
    return get_algorithm(static_cast<ConvBiasForwardImpl*>(conv_bias_opr.get()),
                         *args.layout_src, *args.layout_filter, config.first[0],
                         config.first[1], *args.layout_dst);
}
//! Workspace is whatever the underlying ConvBiasForward needs for the
//! equivalent conv-bias problem.
size_t ConvolutionForwardImpl::AlgoDefault::get_workspace_in_bytes(
        const SizeArgs& args) const {
    auto conv_bias_opr = args.opr->handle()->create_operator<ConvBiasForward>();
    // If the caller pinned an algorithm and recorded a sub-policy, forward
    // the (single) sub-policy so the workspace query matches the algorithm
    // that will actually run in exec().
    if (args.opr->execution_policy().algo.valid() &&
        !args.opr->execution_policy().sub_policy.empty()) {
        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
        conv_bias_opr->execution_policy() =
                args.opr->execution_policy().sub_policy[0];
    }
    auto&& config = sub_opr_config(
            *args.layout_src, *args.layout_filter, *args.layout_dst,
            args.opr);
    conv_bias_opr->param() = config.second;
    // bias/z are empty layouts; no preprocessed filter (nullptr)
    return conv_bias_opr->get_workspace_in_bytes(
            *args.layout_src, *args.layout_filter, config.first[0],
            config.first[1], *args.layout_dst, nullptr);
}
void ConvolutionForwardImpl::AlgoDefault::exec(const ExecArgs& args) const { | |||
auto conv_bias_opr = args.opr->handle()->create_operator<ConvBiasForward>(); | |||
if (args.opr->execution_policy().algo.valid()) { | |||
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); | |||
conv_bias_opr->execution_policy() = | |||
args.opr->execution_policy().sub_policy[0]; | |||
} | |||
auto&& config = sub_opr_config( | |||
*args.layout_src, *args.layout_filter, *args.layout_dst, | |||
args.opr); | |||
conv_bias_opr->param() = config.second; | |||
conv_bias_opr->exec(args.tensor_src, args.tensor_filter, | |||
{nullptr, config.first[0]}, {nullptr, config.first[1]}, | |||
args.tensor_dst, nullptr, args.workspace); | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,111 @@ | |||
/** | |||
* \file dnn/src/cuda/convolution/forward/algos.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. | |||
*/ | |||
#pragma once | |||
#include "megdnn/oprs.h" | |||
#include "src/common/algo_base.h" | |||
#include "src/common/metahelper.h" | |||
#include "src/common/utils.h" | |||
#include "src/cuda/convolution/opr_impl.h" | |||
#include <unordered_map> | |||
namespace megdnn { | |||
namespace cuda { | |||
/*!
 * \brief base class for ConvolutionForward algos on CUDA
 *
 * Concrete algos implement availability / workspace query / execution in
 * terms of SizeArgs (layouts only) and ExecArgs (layouts plus tensors).
 */
class ConvolutionForwardImpl::AlgoBase : public Algorithm {
protected:
    //! protected non-virtual dtor: instances are not deleted via AlgoBase*
    ~AlgoBase() = default;

public:
    //! per-algo tag used with MEGDNN_DECL_ALGO_TYPE in subclasses
    enum class AlgoType : uint32_t {
        CUDA_DEFAULT,
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
    //! size-only arguments; stores non-owning pointers to caller layouts
    struct SizeArgs {
        ConvolutionForwardImpl* opr;
        const TensorLayout *layout_src, *layout_filter, *layout_dst;
        std::string to_string() const;
        SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter, const TensorLayout& dst);
    };
    //! exec arguments: SizeArgs plus the actual tensors and workspace
    struct ExecArgs : public SizeArgs {
        TensorND tensor_src, tensor_filter, tensor_dst;
        Workspace workspace;
        ExecArgs(ConvolutionForwardImpl* opr, _megdnn_tensor_in src,
                 _megdnn_tensor_in filter, _megdnn_tensor_out dst,
                 _megdnn_workspace workspace);
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs&) const = 0;
    //! available and within the given workspace limit (in bytes)
    bool is_available_wk(const SizeArgs& args, size_t limit) const {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }
    //! like is_available_wk, additionally requiring is_reproducible()
    //! when \p reproducible is true
    bool is_available_reproducible(
            const SizeArgs& args, bool reproducible = true,
            size_t limit = std::numeric_limits<size_t>::max()) const {
        return (!reproducible || is_reproducible()) &&
               is_available_wk(args, limit);
    }
    //! assert the provided workspace is large enough; returns *this so the
    //! call can be chained (e.g. check_workspace(...).exec(...))
    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(req <= workspace.size,
                      "convolution fwd algo %s: required workspace %zu bytes, "
                      "got %zu",
                      name(), req, workspace.size);
        return *this;
    }
};
//! Default forward algo: delegates the convolution to a ConvBiasForward
//! sub-operator (IDENTITY nonlinearity, empty bias/z).
class ConvolutionForwardImpl::AlgoDefault final : public AlgoBase {
public:
    AlgoDefault() = default;
    bool is_available(const SizeArgs&) const override;
    size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override;
    const char* name() const override { return "DEFAULT"; }
    void exec(const ExecArgs&) const override;
    bool is_reproducible() const override { return true; }
    //! reports the single CONVBIAS_FORWARD sub-opr this algo dispatches to
    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;
    MEGDNN_DECL_ALGO_TYPE(CUDA_DEFAULT)
};
//! Owns all forward algo instances; populated once in the constructor.
class ConvolutionForwardImpl::AlgoPack : NonCopyableObj {
private:
    //! desc -> algo, for lookup by AlgorithmDesc
    AlgoBase::Mapper m_all_algos_map;

public:
    AlgoPack();
    AlgoDefault algo_default;
    //! flat list of every registered algo (currently only algo_default)
    std::vector<AlgoBase*> all_algos;
    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
} // namespace cuda | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -12,6 +12,7 @@ | |||
#include "src/cuda/convolution/opr_impl.h" | |||
#include "megdnn/dtype.h" | |||
#include "src/cuda/convolution/helper.h" | |||
#include "src/cuda/convolution/forward/algos.h" | |||
#include "src/cuda/convolution/backward_data/algo.h" | |||
#include "src/cuda/convolution/backward_filter/algo.h" | |||
#include "src/cuda/conv_bias/opr_impl.h" | |||
@@ -28,108 +29,34 @@ using namespace convolution; | |||
TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL) | |||
/* ============== ConvolutionForwardImpl ============== */ | |||
ConvolutionForwardImpl::ConvBiasExtraData | |||
ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst) { | |||
auto conv_param = param(); | |||
DType bias_type; | |||
if (src.dtype.enumv() == DTypeEnum::QuantizedS8) { | |||
bias_type = dtype::QuantizedS32( | |||
src.dtype.param<dtype::QuantizedS8>().scale * | |||
filter.dtype.param<dtype::QuantizedS8>().scale); | |||
} else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) { | |||
bias_type = dtype::QuantizedS32( | |||
src.dtype.param<dtype::Quantized8Asymm>().scale * | |||
filter.dtype.param<dtype::Quantized8Asymm>().scale); | |||
} else if (src.dtype.enumv() == DTypeEnum::Uint8 || | |||
src.dtype.enumv() == DTypeEnum::Int8) { | |||
bias_type = dtype::Int32{}; | |||
} else if (src.dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||
bias_type = dtype::QuantizedS32( | |||
src.dtype.param<dtype::Quantized4Asymm>().scale * | |||
filter.dtype.param<dtype::Quantized4Asymm>().scale); | |||
} else { | |||
megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); | |||
bias_type = src.dtype; | |||
} | |||
ConvBiasExtraData ret = {this->handle()->create_operator<ConvBiasForward>(), | |||
TensorLayout(bias_type), TensorLayout(dst.dtype)}; | |||
ret.convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY, | |||
conv_param.mode, | |||
conv_param.sparse, | |||
conv_param.format, | |||
conv_param.pad_h, | |||
conv_param.pad_w, | |||
conv_param.stride_h, | |||
conv_param.stride_w, | |||
conv_param.dilate_h, | |||
conv_param.dilate_w, | |||
conv_param.compute_mode}; | |||
ret.convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; | |||
return ret; | |||
} | |||
ConvolutionForwardImpl::Algorithm* | |||
ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) { | |||
auto extra_data = conv_bias_extra_data(src, filter, dst); | |||
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get()) | |||
->get_algorithm_heuristic(src, filter, extra_data.bias_layout, | |||
extra_data.z_layout, dst, | |||
workspace_limit_in_bytes, reproducible); | |||
} | |||
ConvolutionForwardImpl::Algorithm* | |||
ConvolutionForwardImpl::get_algorithm_from_desc( | |||
const ConvolutionForward::AlgorithmDesc& desc) { | |||
auto conv_param = param(); | |||
auto convbias_opr = this->handle()->create_operator<ConvBiasForward>(); | |||
convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY, | |||
conv_param.mode, | |||
conv_param.sparse, | |||
conv_param.format, | |||
conv_param.pad_h, | |||
conv_param.pad_w, | |||
conv_param.stride_h, | |||
conv_param.stride_w, | |||
conv_param.dilate_h, | |||
conv_param.dilate_w, | |||
conv_param.compute_mode}; | |||
convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; | |||
return static_cast<ConvBiasForwardImpl*>(convbias_opr.get()) | |||
->get_algorithm_from_desc(desc); | |||
AlgoBase::SizeArgs args{this, src, filter, dst}; | |||
MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes); | |||
MEGDNN_MARK_USED_VAR(reproducible); | |||
return &sm_algo_pack.algo_default; | |||
} | |||
std::vector<ConvolutionForwardImpl::Algorithm*> | |||
ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst) { | |||
auto extra_data = conv_bias_extra_data(src, filter, dst); | |||
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get()) | |||
->get_all_algorithms(src, filter, extra_data.bias_layout, | |||
extra_data.z_layout, dst); | |||
AlgoBase::SizeArgs args{this, src, filter, dst}; | |||
return megdnn::get_all_algorithms<ConvolutionForwardImpl>(args); | |||
} | |||
size_t ConvolutionForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
auto extra_data = conv_bias_extra_data(src, filter, dst); | |||
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get()) | |||
->get_workspace_in_bytes( | |||
src, filter, extra_data.bias_layout, extra_data.z_layout, | |||
dst, | |||
reinterpret_cast<const ConvolutionBase< | |||
param::ConvBias>::PreprocessedFilter*>( | |||
preprocessed_filter)); | |||
MEGDNN_MARK_USED_VAR(preprocessed_filter); | |||
AlgoBase::SizeArgs args{this, src, filter, dst}; | |||
return megdnn::get_algorithm(this, src, filter, dst) | |||
->get_workspace_in_bytes(args); | |||
} | |||
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
@@ -137,20 +64,15 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
auto extra_data = | |||
conv_bias_extra_data(src.layout, filter.layout, dst.layout); | |||
TensorND bias(nullptr, extra_data.bias_layout); | |||
TensorND z(nullptr, extra_data.z_layout); | |||
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get()) | |||
->exec(src, filter, bias, z, dst, | |||
reinterpret_cast<const ConvolutionBase< | |||
param::ConvBias>::PreprocessedFilter*>( | |||
preprocessed_filter), | |||
workspace); | |||
check_exec(src.layout, filter.layout, dst.layout, workspace.size, | |||
preprocessed_filter); | |||
AlgoBase::ExecArgs args(this, src, filter, dst, workspace); | |||
auto&& algo = get_algorithm(this, src.layout, filter.layout, dst.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
} | |||
const char* ConvolutionForwardImpl::get_algorithm_set_name() const { | |||
return "CUDACONV0+CUDNN" CUDNN_VERSION_STR; | |||
return "CUDA CONVOLUTION_FORWARD" ; | |||
} | |||
/* ============== ConvolutionBackwardDataImpl ============== */ | |||
@@ -6,7 +6,8 @@ | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
*/ | |||
#pragma once | |||
@@ -16,58 +17,56 @@ | |||
namespace megdnn { | |||
namespace cuda { | |||
class ConvolutionForwardImpl: public ConvolutionForward { | |||
public: | |||
using ConvolutionForward::ConvolutionForward; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) override; | |||
const char* get_algorithm_set_name() const override; | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&) override { | |||
return {}; | |||
} | |||
size_t get_preprocess_workspace_in_bytes( | |||
const TensorLayout& , const TensorLayout& , | |||
const TensorLayout& ) override{ | |||
return 0; | |||
} | |||
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
const TensorLayout&, PreprocessedFilter*, | |||
_megdnn_workspace) override { | |||
megdnn_throw("cuda exec_preprocess has not implemeted yet"); | |||
} | |||
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override; | |||
protected: | |||
struct ConvBiasExtraData{ | |||
std::unique_ptr<ConvBiasForward> convbias_opr; | |||
TensorLayout bias_layout; | |||
TensorLayout z_layout; | |||
}; | |||
std::vector<Algorithm*> get_all_algorithms( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) override; | |||
Algorithm* get_algorithm_heuristic(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) override; | |||
private: | |||
ConvBiasExtraData conv_bias_extra_data(const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&); | |||
class ConvolutionForwardImpl : public ConvolutionForward { | |||
public: | |||
using ConvolutionForward::ConvolutionForward; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) override; | |||
const char* get_algorithm_set_name() const override; | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&) override { | |||
return {}; | |||
} | |||
size_t get_preprocess_workspace_in_bytes(const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
} | |||
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
const TensorLayout&, PreprocessedFilter*, | |||
_megdnn_workspace) override { | |||
megdnn_throw("cuda exec_preprocess has not implemeted yet"); | |||
} | |||
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override; | |||
class AlgoBase; | |||
class AlgoDefault; | |||
class AlgoPack; | |||
static const AlgoPack& algo_pack() { return sm_algo_pack; } | |||
protected: | |||
std::vector<Algorithm*> get_all_algorithms( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) override; | |||
Algorithm* get_algorithm_heuristic(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) override; | |||
private: | |||
static AlgoPack sm_algo_pack; | |||
}; | |||
class ConvolutionBackwardDataImpl : public ConvolutionBackwardData { | |||
@@ -122,6 +121,7 @@ protected: | |||
const TensorLayout& grad, | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) override; | |||
private: | |||
Algorithm* get_algorithm_heuristic(const TensorLayout& filter, | |||
const CanonizedFilterMeta& filter_meta, | |||
@@ -141,12 +141,10 @@ public: | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) override; | |||
AlgorithmInfo get_algorithm_info_heuristic(const TensorLayout& src, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad, | |||
const CanonizedFilterMeta& grad_meta, | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) { | |||
AlgorithmInfo get_algorithm_info_heuristic( | |||
const TensorLayout& src, const TensorLayout& diff, | |||
const TensorLayout& grad, const CanonizedFilterMeta& grad_meta, | |||
size_t workspace_limit_in_bytes, bool reproducible) { | |||
return get_algorithm_heuristic(src, diff, grad, grad_meta, | |||
workspace_limit_in_bytes, reproducible) | |||
->info(); | |||
@@ -162,7 +160,6 @@ public: | |||
->info(); | |||
} | |||
const char* get_algorithm_set_name() const override; | |||
class AlgoBase; | |||
@@ -187,6 +184,7 @@ protected: | |||
const TensorLayout& grad, | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) override; | |||
private: | |||
Algorithm* get_algorithm_heuristic(const TensorLayout& src, | |||
const TensorLayout& diff, | |||
@@ -532,6 +532,30 @@ private: | |||
bool* m_require_algo; | |||
}; | |||
template <typename Opr> | |||
void construct_sub_execution_policy_heuristic(ExecutionPolicy& policy, | |||
const TensorLayoutArray& layouts, | |||
const std::string& param, | |||
Handle* handle) { | |||
megdnn_assert(layouts.size() == OprTrait<Opr>::arity); | |||
auto opr = handle->create_operator<Opr>(); | |||
opr->param() = Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||
if (!policy.algo.valid()) { | |||
policy.algo = AlgoProxy<Opr, OprTrait<Opr>::arity>:: | |||
get_algorithm_info_heuristic(opr.get(), layouts).desc; | |||
} | |||
Algorithm* algo = opr->get_algorithm_from_desc(policy.algo); | |||
std::vector<Algorithm::SearchItem>&& sub_items = | |||
algo->get_subopr_list(layouts, opr.get()); | |||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||
policy.sub_policy.push_back(ExecutionPolicy{}); | |||
construct_sub_execution_policy_heuristic<_Opr>( | |||
policy.sub_policy.back(), _item.layouts, _item.param, | |||
handle); | |||
}); | |||
} | |||
} // namespace test | |||
} // namespace megdnn | |||
@@ -570,6 +570,8 @@ void convolution::test_conv_config_combinations(int k_size, | |||
.set_param(param); | |||
auto opr = checker.opr(); | |||
opr->param() = param; | |||
std::string param_str; | |||
Algorithm::serialize_write_pod(opr->param(), param_str); | |||
TensorLayout ily{ishp, inp_type}, fly{fshp, inp_type}, oly; | |||
oly.dtype = out_type; | |||
opr->deduce_layout(ily, fly, oly); | |||
@@ -581,10 +583,14 @@ void convolution::test_conv_config_combinations(int k_size, | |||
for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) { | |||
used_algos.insert(algo.desc); | |||
opr->execution_policy().algo = algo.desc; | |||
construct_sub_execution_policy_heuristic<ConvolutionForward>( | |||
opr->execution_policy(), {ily, fly, oly}, param_str, | |||
opr->handle()); | |||
checker | |||
.set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str())) | |||
.execs({ishp, fshp, {}}); | |||
opr->execution_policy().algo.reset(); | |||
opr->execution_policy() = {}; | |||
ASSERT_TRUE(checker.prev_succ()) << errmsg(algo.name.c_str()); | |||
} | |||
@@ -597,13 +603,19 @@ void convolution::test_conv_config_combinations(int k_size, | |||
auto opr = checker_bwd_data.opr(); | |||
opr->param() = param; | |||
std::string param_str; | |||
Algorithm::serialize_write_pod(opr->param(), param_str); | |||
for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) { | |||
used_algos_bwd_data.insert(algo.desc); | |||
opr->execution_policy().algo = algo.desc; | |||
construct_sub_execution_policy_heuristic< | |||
ConvolutionBackwardData>(opr->execution_policy(), | |||
{fly, oly, ily}, param_str, | |||
opr->handle()); | |||
checker_bwd_data | |||
.set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str())) | |||
.execl({fly, oly, ily}); | |||
opr->execution_policy().algo.reset(); | |||
opr->execution_policy() = {}; | |||
ASSERT_TRUE(checker_bwd_data.prev_succ()) << | |||
errmsg(algo.name.c_str()); | |||
} | |||
@@ -618,13 +630,19 @@ void convolution::test_conv_config_combinations(int k_size, | |||
auto opr = checker_bwd_filter.opr(); | |||
opr->param() = param; | |||
std::string param_str; | |||
Algorithm::serialize_write_pod(opr->param(), param_str); | |||
for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) { | |||
used_algos_bwd_flt.insert(algo.desc); | |||
opr->execution_policy().algo = algo.desc; | |||
construct_sub_execution_policy_heuristic< | |||
ConvolutionBackwardFilter>(opr->execution_policy(), | |||
{ily, oly, fly}, param_str, | |||
opr->handle()); | |||
checker_bwd_filter | |||
.set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str())) | |||
.execl({ily, oly, fly}); | |||
opr->execution_policy().algo.reset(); | |||
opr->execution_policy() = {}; | |||
ASSERT_TRUE(checker_bwd_filter.prev_succ()) << | |||
errmsg(algo.name.c_str()); | |||
} | |||
@@ -338,6 +338,7 @@ struct OprProxyProfilingBase | |||
FastRunCache& cache) { | |||
megdnn_assert(layouts.size() == arity); | |||
auto opr = handle->create_operator<Opr>(); | |||
opr->param() = | |||
Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||
SmallVector<size_t> sizes_in_bytes; | |||
@@ -427,9 +428,9 @@ struct OprProxyProfilingBase | |||
auto&& search_items = | |||
flatten_search_space(layouts, param_str, opr->handle()); | |||
FOREACH_OPR_TYPE_DISPATCH(search_items, { | |||
OprProxyProfilingBase<_Opr>::search(_item.layouts, param_str, W, | |||
opr->handle(), warmup_times, | |||
exec_times, cache); | |||
OprProxyProfilingBase<_Opr>::search( | |||
_item.layouts, _item.param, W, opr->handle(), | |||
warmup_times, exec_times, cache); | |||
}); | |||
construct_execution_policy(layouts, param_str, opr->handle(), cache, | |||
@@ -273,10 +273,14 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) { | |||
Checker<Convolution> checker(handle_cuda()); | |||
bool require_algo = false; | |||
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>( | |||
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||
"CHANNEL_WISE", {}) | |||
.c_str(), | |||
ExecutionPolicyAlgoName{ | |||
"DEFAULT", | |||
{{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||
"CHANNEL_WISE", {}) | |||
.c_str(), | |||
{}}}}, | |||
&require_algo)); | |||
for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) { | |||
checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype); | |||
if (dtype.enumv() == DTypeEnum::Float16) | |||
@@ -306,8 +310,12 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) { | |||
Checker<Convolution> checker(handle_cuda()); | |||
bool require_algo = false; | |||
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>( | |||
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||
"CHANNEL_WISE_SMALL", {}).c_str(), | |||
ExecutionPolicyAlgoName{ | |||
"DEFAULT", | |||
{{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||
"CHANNEL_WISE_SMALL", {}) | |||
.c_str(), | |||
{}}}}, | |||
&require_algo)); | |||
for (auto dtype : std::vector<DType> { | |||
dtype::Float32(), | |||
@@ -338,6 +346,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) { | |||
bool require_algo = false; | |||
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>( | |||
"CHANNEL_WISE", &require_algo)); | |||
for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) { | |||
checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype); | |||
if (dtype.enumv() == DTypeEnum::Float16) | |||
@@ -368,9 +377,8 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) { | |||
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) { | |||
Checker<ConvolutionBackwardData> checker(handle_cuda()); | |||
bool require_algo = false; | |||
checker.set_before_exec_callback( | |||
AlgoChecker<ConvolutionBackwardData>( | |||
"CHANNEL_WISE_SMALL", &require_algo)); | |||
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>( | |||
"CHANNEL_WISE_SMALL", &require_algo)); | |||
for (auto dtype : std::vector<DType> { | |||
dtype::Float32(), | |||
#if CUDA_VERSION >= 9000 | |||
@@ -396,10 +404,14 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) { | |||
Checker<ConvolutionBackwardFilter> checker(handle_cuda()); | |||
bool require_algo = false; | |||
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>( | |||
"CHANNEL_WISE", &require_algo)); | |||
"CHANNEL_WISE", &require_algo)); | |||
UniformFloatRNG rng(-0.1, 0.1); | |||
for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) { | |||
checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype).set_rng(0, &rng).set_rng(1, &rng); | |||
checker.set_dtype(0, dtype) | |||
.set_dtype(1, dtype) | |||
.set_dtype(2, dtype) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
if (dtype.enumv() == DTypeEnum::Float16) | |||
checker.set_epsilon(2e-1); | |||
// simple case | |||
@@ -514,7 +526,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) { | |||
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | |||
size_t FW) { | |||
checker.proxy()->target_execution_policy.algo.reset(); | |||
checker.proxy()->target_execution_policy = {}; | |||
checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}}); | |||
}; | |||
@@ -614,7 +626,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { | |||
.set_dtype(2, dtype::Float32()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
bencher.proxy()->target_execution_policy = {}; | |||
auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS; | |||
bencher.set_param(param) | |||
@@ -623,7 +635,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { | |||
.set_dtype(2, dtype::Float16()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
bencher.proxy()->target_execution_policy = {}; | |||
auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS; | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
@@ -677,10 +689,13 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) { | |||
CUBenchmarker<ConvolutionForward> bencher(handle_cuda()); | |||
size_t RUNS = 1; | |||
bencher.set_display(false).set_times(RUNS); | |||
bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>( | |||
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||
"CHANNEL_WISE", {}) | |||
.c_str())); | |||
bencher.set_before_exec_callback( | |||
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{ | |||
"DEFAULT", | |||
{{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||
"CHANNEL_WISE", {}) | |||
.c_str(), | |||
{}}}})); | |||
Convolution::Param param; | |||
param.format = ConvBias::Param::Format::NCHW; | |||
@@ -783,17 +798,24 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) { | |||
.set_dtype(2, dtype::Float32()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng) | |||
.set_before_exec_callback(AlgoChecker<ConvolutionForward>( | |||
ConvBiasForward::algo_name< | |||
ConvBiasForward::DirectParam>("CHANNEL_WISE", | |||
{}) | |||
.c_str())); | |||
.set_before_exec_callback( | |||
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{ | |||
"DEFAULT", | |||
{{ConvBiasForward::algo_name< | |||
ConvBiasForward::DirectParam>( | |||
"CHANNEL_WISE", {}) | |||
.c_str(), | |||
{}}}})); | |||
auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS; | |||
bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>( | |||
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||
"CHANNEL_WISE", {}) | |||
.c_str())); | |||
ExecutionPolicyAlgoName{"DEFAULT", | |||
{{ConvBiasForward::algo_name< | |||
ConvBiasForward::DirectParam>( | |||
"CHANNEL_WISE", {}) | |||
.c_str(), | |||
{}}}})); | |||
auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS; | |||
bencher.set_param(param) | |||
@@ -135,10 +135,13 @@ TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) { | |||
.set_rng(1, &int_rng) | |||
.set_param(param); | |||
checker.set_before_exec_callback(AlgoChecker<Convolution>( | |||
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | |||
"MATMUL8X8X32", {}) | |||
.c_str())); | |||
checker.set_before_exec_callback( | |||
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{ | |||
"DEFAULT", | |||
{{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | |||
"MATMUL8X8X32", {}) | |||
.c_str(), | |||
{}}}})); | |||
param.sparse = Convolution::Param::Sparse::DENSE; | |||
param.pad_h = param.pad_w = 1; | |||
@@ -30,19 +30,26 @@ TEST_F(CUDA, DILATED_CONVOLUTION_FORWARD) | |||
auto args = get_dilated_args(); | |||
Checker<ConvolutionForward> checker(handle_cuda()); | |||
#if CUDNN_VERSION >= 7500 | |||
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>( | |||
ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>( | |||
"CUDNN:Convolution:CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_" | |||
"PRECOMP_" | |||
"GEMM" CUDNN_VERSION_STRING, | |||
{}) | |||
.c_str())); | |||
checker.set_before_exec_callback( | |||
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{ | |||
"DEFAULT", | |||
{{ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>( | |||
"CUDNN:Convolution:CUDNN_CONVOLUTION_FWD_ALGO_" | |||
"IMPLICIT_" | |||
"PRECOMP_" | |||
"GEMM" CUDNN_VERSION_STRING, | |||
{}) | |||
.c_str(), | |||
{}}}})); | |||
printf("cudnn version >= 7.5, use cudnn impl for dilated convolution\n"); | |||
#else | |||
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>( | |||
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>("MATMUL", | |||
{}) | |||
.c_str())); | |||
checker.set_before_exec_callback( | |||
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{ | |||
"DEFAULT", | |||
{{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | |||
"MATMUL", {}) | |||
.c_str(), | |||
{}}}})); | |||
#endif | |||
NormalRNG default_rng; | |||
for (auto &&arg: args) { | |||
@@ -116,12 +116,17 @@ TEST_F(CUDA, GROUP_CONV_FORWARD_1x1) { | |||
std::string conv1x1_name = | |||
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | |||
"MATMUL1X1", {}); | |||
checker.set_before_exec_callback(AlgoChecker<Convolution>( | |||
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||
ssprintf("%s:%s", "CUDA:GROUP_CONV", | |||
conv1x1_name.c_str()), | |||
{}) | |||
.c_str())); | |||
checker.set_before_exec_callback( | |||
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{ | |||
"DEFAULT", | |||
{{ConvBiasForward::algo_name< | |||
ConvBiasForward::DirectParam>( | |||
ssprintf("%s:%s", "CUDA:GROUP_CONV", | |||
conv1x1_name.c_str()) | |||
.c_str(), | |||
{}) | |||
.c_str(), | |||
{}}}})); | |||
#endif | |||
Convolution::Param param; | |||
param.sparse = Convolution::Param::Sparse::GROUP; | |||
@@ -231,7 +231,7 @@ void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) { | |||
algo.name.c_str(), str_on_inp_shape.c_str()); | |||
ImplExecutionPolicy policy; | |||
policy.algo = algo.desc; | |||
ctx.construct_execution_policy_from_cache(require_reproducible, policy); | |||
ctx.construct_execution_policy(require_reproducible, policy); | |||
if (ctx.get_workspace_size_bytes(policy) >= workspace_limit) | |||
continue; | |||
@@ -302,7 +302,7 @@ AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible, | |||
}); | |||
} | |||
typename AlgoChooser<Opr>::ImplExecutionPolicy policy; | |||
ctx.construct_execution_policy_from_cache(require_reproducible, policy); | |||
ctx.construct_execution_policy(require_reproducible, policy); | |||
return policy; | |||
MIDOUT_E | |||
} | |||
@@ -324,6 +324,11 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts, | |||
ImplExecutionPolicy policy; | |||
if (auto algo_choose_hook = mgb_opr->algo_chooser()) { | |||
policy = algo_choose_hook(mgb_opr); | |||
ctx.construct_execution_policy( | |||
mgb_opr->execution_policy().strategy == | |||
mixin::AlgoChooserHelper::ExecutionPolicy::Strategy:: | |||
HEURISTIC_REPRODUCIBLE, | |||
policy, false); | |||
} | |||
if (!policy.algo.valid()) { | |||
policy = get_policy(ctx); | |||
@@ -520,13 +525,26 @@ AlgoChooser<Opr>::ExeContext::get_all_candidates() const { | |||
} | |||
template <typename Opr> | |||
void AlgoChooser<Opr>::ExeContext::construct_execution_policy_from_cache( | |||
void AlgoChooser<Opr>::ExeContext::construct_execution_policy( | |||
bool require_reproducible, | |||
typename AlgoChooser<Opr>::ImplExecutionPolicy& policy) const { | |||
typename AlgoChooser<Opr>::ImplExecutionPolicy& policy, | |||
bool retrive_from_cache) const { | |||
if (!policy.algo.valid()) { | |||
policy.algo = get_profile_result_from_cache(require_reproducible).desc; | |||
if (retrive_from_cache) { | |||
policy.algo = | |||
get_profile_result_from_cache(require_reproducible).desc; | |||
} else { | |||
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | |||
owner_graph(), m_cn, m_execution_policy.workspace_limit); | |||
policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic( | |||
args..., workspace_limit, | |||
require_reproducible), | |||
m_layouts) | |||
.desc; | |||
} | |||
mgb_assert(policy.algo.valid(), | |||
"No cache found, maybe some error occured"); | |||
"No algo found from cache or heuristic, maybe some error " | |||
"occured"); | |||
} | |||
Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); | |||
@@ -544,8 +562,9 @@ void AlgoChooser<Opr>::ExeContext::construct_execution_policy_from_cache( | |||
_item.param, m_base_mgb_opr, m_cn, m_execution_policy, | |||
m_allow_weight_preprocess); | |||
policy.sub_policy.push_back({}); | |||
sub_ctx.construct_execution_policy_from_cache(require_reproducible, | |||
policy.sub_policy.back()); | |||
sub_ctx.construct_execution_policy(require_reproducible, | |||
policy.sub_policy.back(), | |||
retrive_from_cache); | |||
}); | |||
return; | |||
@@ -672,11 +691,11 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const { | |||
AlgoChooser<megdnn::Opr>::ExeContext::get_workspace_size_bytes( \ | |||
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | |||
policy) const; \ | |||
template void AlgoChooser<megdnn::Opr>::ExeContext:: \ | |||
construct_execution_policy_from_cache( \ | |||
bool require_reproducible, \ | |||
typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | |||
policy) const; \ | |||
template void \ | |||
AlgoChooser<megdnn::Opr>::ExeContext::construct_execution_policy( \ | |||
bool require_reproducible, \ | |||
typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& policy, \ | |||
bool retrive_from_cache) const; \ | |||
template Maybe<AlgoChooserProfileCache::ResultEntry> \ | |||
AlgoChooser<megdnn::Opr>::ExeContext::profile_single_algo( \ | |||
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | |||
@@ -129,13 +129,16 @@ public: | |||
ImplAlgo get_profile_result_from_cache(bool require_reproducible) const; | |||
/** | |||
* \brief construct execution policy from cache. | |||
* \brief construct execution policy from cache or heuristic. | |||
* | |||
* \param require_reproducible select algo which is reproducible | |||
* \param policy execution policy | |||
* \param retrive_from_cache retrive algo from cache if set True, get | |||
* from heuristic otherwise. | |||
*/ | |||
void construct_execution_policy_from_cache( | |||
bool require_reproducible, ImplExecutionPolicy& policy) const; | |||
void construct_execution_policy( | |||
bool require_reproducible, ImplExecutionPolicy& policy, | |||
bool retrive_from_cache = true) const; | |||
private: | |||
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const; | |||