feat(mgb/opr): add fast profile and combined Execution strategy

GitOrigin-RevId: 843dc3a790
4 years ago · a3ea1f153c
--- a/dnn/include/megdnn/basic_types.h
+++ b/dnn/include/megdnn/basic_types.h
@@ -506,10 +506,66 @@ struct DynOutMallocPolicyCall {
    }
 };


 //! Thin wrapper around the underlying integer of scoped enum \p T that
 //! provides the bitwise operators &, |, ^ and ~. It converts implicitly
 //! back to \p T and explicitly to bool.
 template <typename T>
 class EnumClassBit {
    using Raw = std::underlying_type_t<T>;

    Raw m_val;

    //! wrap a raw integer; private, used by the operators below to build
    //! their results
    constexpr EnumClassBit(Raw v) : m_val(v) {}

 public:
    //! wrap a single enum value
    constexpr EnumClassBit(T v) : m_val(static_cast<Raw>(v)) {}

    //! implicit conversion back to the enum type
    constexpr operator T() const { return static_cast<T>(m_val); }

    //! true when the stored value is non-zero
    constexpr explicit operator bool() const { return m_val; }

    //! bitwise and of the stored values
    constexpr EnumClassBit operator&(const EnumClassBit& rhs) const {
        return m_val & rhs.m_val;
    }

    //! bitwise or of the stored values
    constexpr EnumClassBit operator|(const EnumClassBit& rhs) const {
        return m_val | rhs.m_val;
    }

    //! bitwise xor of the stored values
    constexpr EnumClassBit operator^(const EnumClassBit& rhs) const {
        return m_val ^ rhs.m_val;
    }

    //! bitwise complement of the stored value
    constexpr EnumClassBit operator~() const { return ~m_val; }
 };

 #endif  // MEGDNN_CC_HOST

 }  // namespace megdnn

 //! Helper: define binary operator \p op for (cls, cls) and for
 //! (EnumClassBit<cls>, cls); both overloads return EnumClassBit<cls>.
 #define _MEGDNN_DECBO_SINGLE_OPR(cls, op)                                    \
    inline constexpr ::megdnn::EnumClassBit<cls> operator op(cls x, cls y) { \
        return ::megdnn::EnumClassBit<cls>(x)                                \
                op ::megdnn::EnumClassBit<cls>(y);                           \
    }                                                                        \
    inline constexpr ::megdnn::EnumClassBit<cls> operator op(                \
            ::megdnn::EnumClassBit<cls> x, cls y) {                          \
        return x op ::megdnn::EnumClassBit<cls>(y);                          \
    }

 //! Helper: define the compound assignment `op=` on cls&; the result of
 //! `x op EnumClassBit<cls>(y)` is stored back into x, which is returned.
 #define _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, op)          \
    inline constexpr cls& operator op##=(cls& x, cls y) { \
        x = x op ::megdnn::EnumClassBit<cls>(y);          \
        return x;                                         \
    }

 //! Define the bitwise operators for enum class \p cls: binary &, |, ^,
 //! their compound assignments &=, |=, ^=, and unary ~.
 //! The operators are emitted at the point of expansion.
 #define MEGDNN_DEF_ENUM_CLASS_BIT_OPR(cls)                          \
    _MEGDNN_DECBO_SINGLE_OPR(cls, &)                                \
    _MEGDNN_DECBO_SINGLE_OPR(cls, |)                                \
    _MEGDNN_DECBO_SINGLE_OPR(cls, ^)                                \
    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, &)                         \
    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, |)                         \
    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, ^)                         \
    inline constexpr ::megdnn::EnumClassBit<cls> operator~(cls x) { \
        return ~::megdnn::EnumClassBit<cls>(x);                     \
    }

 #include "megdnn/internal/visibility_epilogue.h"

 // vim: syntax=cpp.doxygen
--- a/dnn/include/megdnn/oprs/base.h
+++ b/dnn/include/megdnn/oprs/base.h
@@ -251,6 +251,8 @@ protected:
    Handle::HandleType m_handle_type = Handle::HandleType::NAIVE;
 };

 MEGDNN_DEF_ENUM_CLASS_BIT_OPR(Algorithm::Attribute)

 //! policy for executing the operator
 struct ExecutionPolicy {
    //! INVALID_ALGO_TYPE algo_type means using heuristic
--- a/dnn/scripts/gen_flatbuffers_schema.py
+++ b/dnn/scripts/gen_flatbuffers_schema.py
@@ -53,9 +53,13 @@ class FlatBuffersWriter(IndentWriterBase):
            e = self._enums[(p, e)]
            self._write_doc(e.name)
            self._write("enum %s%s : uint {", p, e.name, indent=1)
            for member in e.members:
            for idx, member in enumerate(e.members):
                self._write_doc(member)
                self._write("%s,", scramble_enum_member_name(str(member)))
                if e.combined:
                    self._write("%s=%d,", scramble_enum_member_name(str(member)),
                            1<<idx)
                else:
                    self._write("%s,", scramble_enum_member_name(str(member)))
            self._write("}\n", indent=-1)

    def _write_doc(self, doc):
--- a/dnn/scripts/gen_param_defs.py
+++ b/dnn/scripts/gen_param_defs.py
@@ -80,13 +80,13 @@ class member_defs:
        :attr member_alias: list of (member, alias) pairs
        """
        __slots__ = ['name', 'name_field', 'members', 'default',
                     'member_alias']
                     'member_alias', 'combined']

        all_enums = {}
        """(param_name, name) => enum"""

        def __init__(self, param_name, name, name_field, members, default,
                     member_alias):
                member_alias, combined = False):
            name = member_defs.Doc.make(name)
            assert name.id[0].isupper()
            members = tuple(map(member_defs.Doc.make, members))
@@ -97,6 +97,7 @@ class member_defs:
                default = name_field.index(default)
            assert isinstance(default, int)
            self.name = name
            self.combined = combined
            self.name_field = self.get_name_field(name.id, name_field)
            self.members = members
            self.default = default
@@ -197,6 +198,12 @@ class ParamDef:
            self.name.id, name, name_field, members, default, member_alias))
        return self

    def add_bit_combination_enum(self, name, *members, default=0,
                 name_field=None, member_alias=[]):
        """Append an enum member flagged as a bit-combination enum.

        The arguments are forwarded to :class:`member_defs.Enum` with its
        ``combined`` flag set to True.

        :return: ``self``
        """
        combined_enum = member_defs.Enum(
            self.name.id, name, name_field, members, default, member_alias,
            combined=True)
        self.members.append(combined_enum)
        return self

    def add_enum_alias(self, name, src_class, src_name=None, name_field=None,
                       default=None):
        self.members.append(member_defs.EnumAlias(
@@ -463,8 +470,12 @@ class SerializedDType(_ParamDefBase):
        for idx, emem in enumerate(e.members):
            self._write('%s = "%s"', emem, emem)
            self._write_doc(emem)
            self._enum_member2num.append('id({}.{}):{}'.format(
                qualname, emem, idx))
            if e.combined:
                self._enum_member2num.append('id({}.{}):{}'.format(
                    qualname, emem, 1<<idx))
            else:
                self._enum_member2num.append('id({}.{}):{}'.format(
                    qualname, emem, idx))

        for emem, emem_alis in e.member_alias:
            self._write('%s = %s', emem_alis, emem)
@@ -622,6 +633,8 @@ class CPPWriter(IndentWriterBase):
        for idx, i in enumerate(e.members):
            self._write_doc(i)
            v = '{} = {}'.format(i, idx)
            if e.combined:
                v = '{} = 1 << {}'.format(i, idx)
            if i is not e.members[-1] or e.member_alias:
                v += ','
            self._write(v)
@@ -672,7 +685,6 @@ class CPPEnumValueWriter(CPPWriter):
            self._write('static const uint32_t %s = %s;', alias, mem)
        self._write('};', indent=-1)


    def _on_member_enum_alias(self, e):
        s = e.src_enum
        self._write('typedef %s::%s %s;', e.src_class, e.src_name, e.name)
--- a/dnn/scripts/gen_tablegen.py
+++ b/dnn/scripts/gen_tablegen.py
@@ -91,12 +91,17 @@ class ConverterWriter(IndentWriterBase):
        def format(v):
            return '\"{}\"'.format(str(v))
        enum_def += ','.join(format(i) for i in e.members)
        enum_def += "]"

        if e.combined:
            enum_def += "], 1"
        else:
            enum_def += "], 0"

        if ENUM_TO_STRING_SPECIAL_RULES.count((p.name, e.name)):
            enum_def += ", 1" # whether generate ToStringTrait
        enum_def += ">"
        self._write("def {} : {};".format(td_class, enum_def))

        self._write("def {} : {};".format(td_class, enum_def))
        if self._skip_current_param:
            return

--- a/dnn/src/common/algo_base.h
+++ b/dnn/src/common/algo_base.h
@@ -21,8 +21,6 @@

 namespace megdnn {

 MEGDNN_DEF_ENUM_CLASS_BIT_OPR(AlgoAttribute)

 #define MEGDNN_DECL_ALGO_TYPE(_type)                              \
    uint32_t type() const override {                              \
        return static_cast<std::underlying_type<AlgoType>::type>( \
--- a/dnn/src/common/utils.h
+++ b/dnn/src/common/utils.h
@@ -692,61 +692,6 @@ inline void* get_origin_ptr(const TensorND* tensor, void* ptr) {
                              tensor->layout.span().low_byte);
 }

 //! Thin wrapper around the underlying integer of scoped enum \p T that
 //! provides the bitwise operators &, |, ^ and ~. It converts implicitly
 //! back to \p T and explicitly to bool.
 template <typename T>
 class EnumClassBit {
    using Raw = std::underlying_type_t<T>;

    Raw m_val;

    //! wrap a raw integer; private, used by the operators below to build
    //! their results
    constexpr EnumClassBit(Raw v) : m_val(v) {}

 public:
    //! wrap a single enum value
    constexpr EnumClassBit(T v) : m_val(static_cast<Raw>(v)) {}

    //! implicit conversion back to the enum type
    constexpr operator T() const { return static_cast<T>(m_val); }

    //! true when the stored value is non-zero
    constexpr explicit operator bool() const { return m_val; }

    //! bitwise and of the stored values
    constexpr EnumClassBit operator&(const EnumClassBit& rhs) const {
        return m_val & rhs.m_val;
    }

    //! bitwise or of the stored values
    constexpr EnumClassBit operator|(const EnumClassBit& rhs) const {
        return m_val | rhs.m_val;
    }

    //! bitwise xor of the stored values
    constexpr EnumClassBit operator^(const EnumClassBit& rhs) const {
        return m_val ^ rhs.m_val;
    }

    //! bitwise complement of the stored value
    constexpr EnumClassBit operator~() const { return ~m_val; }
 };

 //! Helper: define binary operator \p op for (cls, cls) and for
 //! (EnumClassBit<cls>, cls); both overloads return EnumClassBit<cls>.
 #define _MEGDNN_DECBO_SINGLE_OPR(cls, op)                                    \
    inline constexpr ::megdnn::EnumClassBit<cls> operator op(cls x, cls y) { \
        return ::megdnn::EnumClassBit<cls>(x)                                \
                op ::megdnn::EnumClassBit<cls>(y);                           \
    }                                                                        \
    inline constexpr ::megdnn::EnumClassBit<cls> operator op(                \
            ::megdnn::EnumClassBit<cls> x, cls y) {                          \
        return x op ::megdnn::EnumClassBit<cls>(y);                          \
    }

 //! Helper: define the compound assignment `op=` on cls&; the result of
 //! `x op EnumClassBit<cls>(y)` is stored back into x, which is returned.
 #define _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, op)          \
    inline constexpr cls& operator op##=(cls& x, cls y) { \
        x = x op ::megdnn::EnumClassBit<cls>(y);          \
        return x;                                         \
    }

 //! Define the bitwise operators for enum class \p cls: binary &, |, ^,
 //! their compound assignments &=, |=, ^=, and unary ~.
 //! The operators are emitted at the point of expansion.
 #define MEGDNN_DEF_ENUM_CLASS_BIT_OPR(cls)                          \
    _MEGDNN_DECBO_SINGLE_OPR(cls, &)                                \
    _MEGDNN_DECBO_SINGLE_OPR(cls, |)                                \
    _MEGDNN_DECBO_SINGLE_OPR(cls, ^)                                \
    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, &)                         \
    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, |)                         \
    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, ^)                         \
    inline constexpr ::megdnn::EnumClassBit<cls> operator~(cls x) { \
        return ~::megdnn::EnumClassBit<cls>(x);                     \
    }

 }  // namespace megdnn

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/convolution3d/backward_filter/algo.h
+++ b/dnn/src/cuda/convolution3d/backward_filter/algo.h
@@ -218,4 +218,3 @@ public:
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen

--- a/imperative/python/megengine/functional/debug_param.py
+++ b/imperative/python/megengine/functional/debug_param.py
@@ -8,9 +8,12 @@
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import os

 from ..core.ops import builtin
 from ..logger import get_logger
 from ..utils.deprecation import deprecated

 Strategy = builtin.ops.Convolution.Strategy

 _execution_strategy = os.getenv("MEGENGINE_EXECUTION_STRATEGY", "HEURISTIC")

 if os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY") != None:
@@ -19,7 +22,7 @@ if os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY") != None:
    )


 def get_execution_strategy() -> str:
 def get_execution_strategy() -> Strategy:
    """
    Returns the execution strategy of :class:`~.Conv2d` and :func:'~.matmul'

@@ -28,12 +31,22 @@ def get_execution_strategy() -> str:
    return _execution_strategy


 def set_execution_strategy(option: str):
 def set_execution_strategy(option):
    """
    Sets the execution strategy of :class:`~.Conv2d` and :func:'~.matmul'

    :param option: Decides how :class:`~.Conv2d` and :func:'~.matmul' algorithms are chosen.
        Available values:
    :param option: Decides how :class:`~.Conv2d` and :func:`~.matmul` algorithms are chosen.
        Available Strategy values:
        * HEURISTIC uses heuristics to choose the fastest algorithm.
        * PROFILE runs possible algorithms on the real device to find the best one.
        * REPRODUCIBLE uses only algorithms that are reproducible.
        * OPTMIZED uses only algorithms that are optimized.

        The default strategy is HEURISTIC. These options can be combined to
        form a combination option, e.g. PROFILE | REPRODUCIBLE selects the
        fastest algorithm found by profiling that is also reproducible.

        Available values string:

        * 'HEURISTIC' uses heuristic to choose the fastest algorithm.
        * 'PROFILE' runs possible algorithms on real device to find the best one.
@@ -45,18 +58,29 @@ def set_execution_strategy(option: str):

        It can also be set through the environment variable 'MEGENGINE_EXECUTION_STRATEGY'.
    """
    valid_option = (
        "HEURISTIC",
        "PROFILE",
        "PROFILE_HEURISTIC",
        "PROFILE_REPRODUCIBLE",
        "HEURISTIC_REPRODUCIBLE",
    )
    if not option in valid_option:
        raise ValueError("Valid option can only be one of {}".format(valid_option))
    valid_string_option = {
        "REPRODUCIBLE": Strategy.REPRODUCIBLE,
        "HEURISTIC": Strategy.HEURISTIC,
        "PROFILE": Strategy.PROFILE,
    }

    global _execution_strategy  # pylint: disable=global-statement
    _execution_strategy = option
    if isinstance(option, Strategy):
        _execution_strategy = option
        return

    assert isinstance(option, str)

    strategy_tmp = Strategy(0)
    for opt in option.split("_"):
        if not opt in valid_string_option:
            raise ValueError(
                "Valid option can only be one of {}, or combine them with '_'.".format(
                    valid_string_option.keys()
                )
            )
        strategy_tmp = strategy_tmp | valid_string_option[opt]
    _execution_strategy = strategy_tmp


@deprecated(version="1.3", reason="use get_execution_strategy() instead")
--- a/imperative/python/test/integration/test_correctness_mnistnet.py
+++ b/imperative/python/test/integration/test_correctness_mnistnet.py
@@ -19,6 +19,7 @@ import megengine.autodiff as ad
 import megengine.functional as F
 from megengine import jit
 from megengine.core._trace_option import set_symbolic_shape
 from megengine.core.ops import builtin
 from megengine.core.tensor.utils import make_shape_tuple
 from megengine.functional.debug_param import set_execution_strategy
 from megengine.jit import SublinearMemoryConfig
@@ -33,6 +34,8 @@ from megengine.module import (
 from megengine.optimizer import SGD
 from megengine.tensor import Tensor

 Strategy = builtin.ops.Convolution.Strategy


 def get_gpu_name():
    try:
@@ -242,7 +245,7 @@ def test_correctness():
    else:
        model_name = "mnist_model_with_test_cpu.mge"
    model_path = os.path.join(os.path.dirname(__file__), model_name)
    set_execution_strategy("HEURISTIC_REPRODUCIBLE")
    set_execution_strategy(Strategy.HEURISTIC | Strategy.REPRODUCIBLE)

    run_train(model_path, False, False, max_err=1e-5)
    run_train(model_path, True, False, max_err=1e-5)
--- a/imperative/tablegen/autogen.cpp
+++ b/imperative/tablegen/autogen.cpp
@@ -337,6 +337,20 @@ static void gen_op_def_pybind11_single(raw_ostream &os, MgbOp& op, EnumContext&
                        className, attr->getEnumName(), i
                    ));
                }
                if (attr->getEnumCombinedFlag()) {
                    //! define operator |
                    os << formatv(
                            "\n    .def(\"__or__\", []({0}::{1} s0, {0}::{1} s1) {{ "
                            "\n         return static_cast<{0}::{1}>(uint32_t(s0) | uint32_t(s1));"
                            "\n      })",
                            className, attr->getEnumName());
                    //! define operator &
                    os << formatv(
                            "\n    .def(\"__and__\", []({0}::{1} s0, {0}::{1} s1) {{"
                            "\n         return static_cast<{0}::{1}>(uint32_t(s0) & uint32_t(s1));"
                            "\n    })",
                            className, attr->getEnumName());
                }
                os << formatv(
                    "\n    .def(py::init([](const std::string& in) {"
                    "\n        auto&& str = normalize_enum(in);"
--- a/imperative/tablegen/helper.h
+++ b/imperative/tablegen/helper.h
@@ -77,6 +77,9 @@ struct MgbEnumAttrMixin : public MgbAttrWrapperBase {
    //! value of the "supportToString" bit on the base record
    bool supportToString() const {
        const bool to_string_enabled =
                getBaseRecord()->getValueAsBit("supportToString");
        return to_string_enabled;
    }
    //! value of the "enumCombined" bit on the base record
    bool getEnumCombinedFlag() const {
        const bool is_combined =
                getBaseRecord()->getValueAsBit("enumCombined");
        return is_combined;
    }
 };

 struct MgbHashableAttrMixin : public MgbAttrWrapperBase {
--- a/sdk/load-and-run/src/mgblar.cpp
+++ b/sdk/load-and-run/src/mgblar.cpp
@@ -142,8 +142,16 @@ R"__usage__(
 #if MGB_ENABLE_FASTRUN
 R"__usage__(
  --fast-run
    Enable fast-run mode. Operators with multiple algorithms would be profiled
    on the real device with actual input shapes.
    This param will be deperated later, please replace with param --full-profile.
 --full-profile
    Enable full-profile mode. Operators with multiple algorithms would be profiled
    on the real device with actual input shapes, all algorithms will be profiled
    include naive algorithms.
    See `mgb::gopt::enable_opr_algo_profiling_inplace` for more details.
 --fast-profile
    Enable fast-profile mode. Operators with multiple algorithms would be profiled
    on the real device with actual input shapes, this mode will only profile the
    well optimized algorithms to get the profile result fast.
    See `mgb::gopt::enable_opr_algo_profiling_inplace` for more details.
 )__usage__"
 #endif
@@ -511,7 +519,8 @@ struct Args {
    bool disable_assert_throw = false;
    bool share_param_mem = false;
 #if MGB_ENABLE_FASTRUN
    bool use_fast_run = false;
    bool use_full_profile = false;
    bool use_fast_profile = false;
 #endif
    bool reproducible = false;
    std::string fast_run_cache_path;
@@ -695,18 +704,20 @@ void run_test_st(Args &env) {
    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::HEURISTIC;
 #if MGB_ENABLE_FASTRUN
    if (env.use_fast_run) {
    if (env.use_full_profile) {
        if (env.reproducible) {
            strategy = S::PROFILE_REPRODUCIBLE;
            strategy = S::PROFILE | S::REPRODUCIBLE;
        } else {
            strategy = S::PROFILE;
        }
    } else if (env.use_fast_profile) {
        strategy = S::PROFILE | S::OPTMIZED;
    } else if (env.reproducible) {
        strategy = S::HEURISTIC_REPRODUCIBLE;
        strategy = S::HEURISTIC | S::REPRODUCIBLE;
    }
 #else
    if (env.reproducible) {
        strategy = S::HEURISTIC_REPRODUCIBLE;
        strategy = S::HEURISTIC | S::REPRODUCIBLE;
    }
 #endif
    mgb::gopt::modify_opr_algo_strategy_inplace(vars, strategy);
@@ -729,11 +740,12 @@ void run_test_st(Args &env) {
                    std::make_shared<InFilePersistentCache>(buf.get(), flen));
 #if MGB_ENABLE_FASTRUN
        } else {
            mgb_assert(env.use_fast_run, "fast-run should be enabled");
            mgb_assert(env.use_full_profile || env.use_fast_profile,
                       "fast-run or fast-profile should be enabled");
            PersistentCache::set_impl(
                    std::make_shared<InFilePersistentCache>());
        }
        if (!env.use_fast_run)
        if (!env.use_full_profile && !env.use_fast_profile)
 #endif
            mgb::gopt::enable_opr_use_profiling_cache_inplace(vars);
    }
@@ -1314,7 +1326,18 @@ Args Args::from_argv(int argc, char **argv) {
        }
 #if MGB_ENABLE_FASTRUN
        if (!strcmp(argv[i], "--fast-run")) {
            ret.use_fast_run = true;
            mgb_log_warn(
                    "--fast-run param will be deperated later, please replace "
                    "with --full-profile or --fast-profile.");
            ret.use_full_profile = true;
            continue;
        }
        if (!strcmp(argv[i], "--full-profile")) {
            ret.use_full_profile = true;
            continue;
        }
        if (!strcmp(argv[i], "--fast-profile")) {
            ret.use_fast_profile = true;
            continue;
        }
 #endif
--- a/src/core/impl/utils/persistent_cache.cpp
+++ b/src/core/impl/utils/persistent_cache.cpp
@@ -188,7 +188,7 @@ AlgoChooserProfileCache::get(const Key &key) {
        auto entry_len = read_uint32();
        mgb_assert(buf + entry_len <= buf_end);
        auto nr = sscanf(reinterpret_cast<const char*>(buf), ENTRY_FMT,
                         &i.reproducible, &i.time, &i.workspace);
                         &i.attribute, &i.time, &i.workspace);
        mgb_assert(nr == 3);
        buf += entry_len;
    }
@@ -210,10 +210,10 @@ void AlgoChooserProfileCache::put(const Key &key, Result &result) {
        auto &&cur = result[i];

        if (prev.workspace <= cur.workspace &&
                prev.reproducible == cur.reproducible) {
            prev.attribute == cur.attribute) {
            result.erase(result.begin() + i);
        } else {
            ++ i;
            ++i;
        }
    }

@@ -235,8 +235,8 @@ void AlgoChooserProfileCache::put(const Key &key, Result &result) {
        write_uint32(0);
        pos = val.size();
        val.resize(pos + SPR_SIZE);
        uint32_t nr = snprintf(&val[pos], SPR_SIZE,
                ENTRY_FMT, i.reproducible, i.time, i.workspace);
        uint32_t nr = snprintf(&val[pos], SPR_SIZE, ENTRY_FMT, i.attribute,
                               i.time, i.workspace);
        //! for memory boundary failed, snprintf ret do not contain \0
        nr += 1;
        mgb_assert(nr < SPR_SIZE);
--- a/src/core/include/megbrain/common.h
+++ b/src/core/include/megbrain/common.h
@@ -12,6 +12,8 @@
 #pragma once

 #include "megbrain_build_config.h"
 #include "megbrain/opr/param_defs.h"
 #include "megdnn/basic_types.h"

 #include <memory>
 #include <string>
@@ -242,6 +244,16 @@ inline constexpr std::size_t operator"" _z(unsigned long long n) {
    return n;
 }
 #endif

 //! MGB-prefixed spelling of MEGDNN_DEF_ENUM_CLASS_BIT_OPR; forwards \p cls
 //! unchanged, so the generated operators use ::megdnn::EnumClassBit.
 #define MGB_DEF_ENUM_CLASS_BIT_OPR(cls) \
    MEGDNN_DEF_ENUM_CLASS_BIT_OPR(cls)

 }   // namespace mgb

 namespace megdnn {
 namespace param {
 //! define the bitwise operators for ExecutionPolicy::Strategy inside
 //! megdnn::param, so that values can be combined, e.g.
 //! Strategy::PROFILE | Strategy::REPRODUCIBLE
 MGB_DEF_ENUM_CLASS_BIT_OPR(ExecutionPolicy::Strategy)
 }  // namespace param
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -12,7 +12,6 @@
 #pragma once

 #include "megbrain/utils/hash.h"
 #include "megbrain/utils/enum_class_bit.h"
 #include "megbrain/utils/metahelper.h"
 #include "megbrain/utils/thin/hash_table.h"
 #include "megbrain/utils/thread.h"
--- a/src/core/include/megbrain/graph/operator_node.h
+++ b/src/core/include/megbrain/graph/operator_node.h
@@ -16,7 +16,6 @@
 #include "megbrain/graph/symbol_var.h"

 #include "megbrain/utils/hashable.h"
 #include "megbrain/utils/enum_class_bit.h"
 #include "megbrain/utils/thin/hash_table.h"
 #include "megbrain/utils/small_vector.h"

--- a/src/core/include/megbrain/graph/var_node.h
+++ b/src/core/include/megbrain/graph/var_node.h
@@ -12,7 +12,6 @@
 #pragma once

 #include "megbrain/graph/bases.h"
 #include "megbrain/utils/enum_class_bit.h"
 #include "megbrain/utils/comp_node_sync_manager.h"
 #include "megbrain/utils/small_vector.h"
 #include "megbrain/utils/mempool.h"
--- a/src/core/include/megbrain/ir/base.td
+++ b/src/core/include/megbrain/ir/base.td
@@ -33,10 +33,11 @@ class MgbHashableAttrMixin {
  string reprFunction = "std::to_string($0)";
 }

 class MgbEnumAttrMixin<string namespace, string name, list<string> members, bit toString> {
 class MgbEnumAttrMixin<string namespace, string name, list<string> members, bit combined, bit toString> {
  string parentNamespace = namespace;
  string enumName = name;
  list<string> enumMembers = members;
  bit enumCombined = combined;
  bit supportToString = toString;
 }

@@ -166,8 +167,8 @@ class MgbTupleAttr<list<MgbAttrWrapper> args>:
 }

 // -- enum types
 class MgbEnumAttr<string namespace, string enumName, list<string> members, bit toString=0>:
    HashableAttr<namespace # "::" # enumName>, MgbEnumAttrMixin<namespace, enumName, members, toString> {
 class MgbEnumAttr<string namespace, string enumName, list<string> members, bit combined, bit toString=0>:
    HashableAttr<namespace # "::" # enumName>, MgbEnumAttrMixin<namespace, enumName, members, combined, toString> {
  let storageType = "::mlir::IntegerAttr";
  let convertFromStorage = "static_cast<" # returnType # ">($_self.getInt())";
  let constBuilderCall = "$_builder.getI32IntegerAttr(static_cast<int32_t>($0))";
@@ -176,7 +177,7 @@ class MgbEnumAttr<string namespace, string enumName, list<string> members, bit t
 }

 class MgbEnumAliasAttr<string namespace, string enumName, MgbEnumAttr base>:
    MgbEnumAttr<namespace, enumName, base.enumMembers>, MgbAliasAttrMixin<base>;
    MgbEnumAttr<namespace, enumName, base.enumMembers, 0>, MgbAliasAttrMixin<base>;

 // -- other types
 def MgbDTypeAttr: HashableAttr<"::megdnn::DType"> {
--- a/src/core/include/megbrain/utils/enum_class_bit.h
+++ b/src/core/include/megbrain/utils/enum_class_bit.h
@@ -1,89 +0,0 @@
 /**
 * \file src/core/include/megbrain/utils/enum_class_bit.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #pragma once

 #include <type_traits>

 namespace mgb {
    //! Thin wrapper around the underlying integer of scoped enum \p T
    //! that provides the bitwise operators &, |, ^ and ~. It converts
    //! implicitly back to \p T and explicitly to bool.
    template<typename T>
    class EnumClassBit {
        using Raw = std::underlying_type_t<T>;

        Raw m_val;

        //! wrap a raw integer; private, used by the operators below to
        //! build their results
        constexpr EnumClassBit(Raw v): m_val(v) {}

        public:
            //! wrap a single enum value
            constexpr EnumClassBit(T v): m_val(static_cast<Raw>(v)) {}

            //! implicit conversion back to the enum type
            constexpr operator T() const { return static_cast<T>(m_val); }

            //! true when the stored value is non-zero
            constexpr explicit operator bool() const { return m_val; }

            //! bitwise and of the stored values
            constexpr EnumClassBit operator & (
                    const EnumClassBit &rhs) const {
                return m_val & rhs.m_val;
            }

            //! bitwise or of the stored values
            constexpr EnumClassBit operator | (
                    const EnumClassBit &rhs) const {
                return m_val | rhs.m_val;
            }

            //! bitwise xor of the stored values
            constexpr EnumClassBit operator ^ (
                    const EnumClassBit &rhs) const {
                return m_val ^ rhs.m_val;
            }

            //! bitwise complement of the stored value
            constexpr EnumClassBit operator ~() const { return ~m_val; }
    };

 }

 //! Helper: define binary operator \p op for (cls, cls) and for
 //! (EnumClassBit<cls>, cls); both overloads return ::mgb::EnumClassBit<cls>.
 #define _MGB_DECBO_SINGLE_OPR(cls, op) \
     inline constexpr ::mgb::EnumClassBit<cls> operator op (cls x, cls y) { \
         return ::mgb::EnumClassBit<cls>(x) op ::mgb::EnumClassBit<cls>(y); \
     } \
     inline constexpr ::mgb::EnumClassBit<cls> operator op ( \
             ::mgb::EnumClassBit<cls> x, cls y) { \
         return x op ::mgb::EnumClassBit<cls>(y); \
     }

 //! Helper: define the compound assignment `op=` on cls&; the result of
 //! `x op EnumClassBit<cls>(y)` is stored back into x, which is returned.
 #define _MGB_DECBO_SINGLE_OPR_ASSIGN(cls, op) \
     inline constexpr cls& operator op##= (cls& x, cls y) { \
         x = x op ::mgb::EnumClassBit<cls>(y); \
         return x; \
     }

 //! Define the bitwise operators for enum class \p cls: binary &, |, ^,
 //! their compound assignments &=, |=, ^=, and unary ~.
 //! NOTE: the definition ends with a line continuation, so the line that
 //! follows it must stay blank.
 #define MGB_DEF_ENUM_CLASS_BIT_OPR(cls) \
    _MGB_DECBO_SINGLE_OPR(cls, &) \
    _MGB_DECBO_SINGLE_OPR(cls, |) \
    _MGB_DECBO_SINGLE_OPR(cls, ^) \
    _MGB_DECBO_SINGLE_OPR_ASSIGN(cls, &) \
    _MGB_DECBO_SINGLE_OPR_ASSIGN(cls, |) \
    _MGB_DECBO_SINGLE_OPR_ASSIGN(cls, ^) \
    inline constexpr ::mgb::EnumClassBit<cls> operator ~ (cls x) { \
        return ~::mgb::EnumClassBit<cls>(x); \
    } \



 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

--- a/src/core/include/megbrain/utils/persistent_cache.h
+++ b/src/core/include/megbrain/utils/persistent_cache.h
@@ -100,8 +100,7 @@ namespace mgb {

            struct ResultEntry {
                std::string algo;   //! identifier of the algorithm
                //! sscanf will up bool as int
                int reproducible;  //! whether algorithm is reproducible
                uint32_t attribute;  //! algo attribute, e.g. reproducible
                double time;        //! execution time in seconds
                size_t workspace;   //! workspace in bytes
            };
--- a/src/gopt/impl/inference.cpp
+++ b/src/gopt/impl/inference.cpp
@@ -54,7 +54,6 @@ using namespace gopt;

 namespace {


 template <typename SharedDeviceTensor, typename MultipleDeviceTensorHolder>
 void param_merge(OptState& opt_state) {
    auto rewriter = opt_state.graph().make_rewriter();
@@ -102,7 +101,7 @@ void param_merge(OptState& opt_state) {
    rewriter.apply_inplace();
 }

 }
 }  // namespace

 /* ================ global functions ================ */

@@ -190,12 +189,10 @@ void gopt::enable_opr_algo_profiling_inplace(

 void gopt::enable_opr_use_profiling_cache_inplace(
        const VarNodeArrayView& dest_vars) {
    modify_opr_algo_strategy_inplace(
            dest_vars, opr::mixin::AlgoChooserHelper::ExecutionPolicy::
                               Strategy::PROFILE_HEURISTIC);
    using S = megdnn::param::ExecutionPolicy::Strategy;
    modify_opr_algo_strategy_inplace(dest_vars, S::PROFILE | S::HEURISTIC);
 }


 void gopt::set_opr_algo_workspace_limit_inplace(
        const VarNodeArrayView& dest_vars, size_t workspace_limit) {
    static const ThinHashMap<Typeinfo*, void (*)(OperatorNodeBase&, size_t)>
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -1693,7 +1693,22 @@ TEST(TestGoptInference, ProfileCache) {
    using S = opr::Convolution::ExecutionPolicy::Strategy;
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
    gopt::enable_opr_use_profiling_cache_inplace({z + 2.3f});
    ASSERT_EQ(S::PROFILE_HEURISTIC, conv.execution_policy().strategy);
    ASSERT_EQ(S::PROFILE | S::HEURISTIC, conv.execution_policy().strategy);
 }

 //! A combined PROFILE | OPTMIZED strategy set through
 //! modify_opr_algo_strategy_inplace must show up in the execution policy
 //! of the convolution operator.
 TEST(TestGoptInference, FastProfileCache) {
    using S = opr::Convolution::ExecutionPolicy::Strategy;

    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9});
    auto host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = opr::Host2DeviceCopy::make(*graph, host_y);
    auto z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();

    // before any modification the strategy is expected to be HEURISTIC
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);

    gopt::modify_opr_algo_strategy_inplace({z + 2.3f},
                                           S::PROFILE | S::OPTMIZED);

    // the combined strategy must be stored unchanged
    ASSERT_EQ(S::PROFILE | S::OPTMIZED, conv.execution_policy().strategy);
 }

 TEST(TestGoptInference, AlgoWorkspaceLimit) {
--- a/src/opr/impl/dnn/dnn.sereg.h
+++ b/src/opr/impl/dnn/dnn.sereg.h
@@ -20,7 +20,6 @@
 #include "megbrain/opr/dnn/lrn.h"
 #include "megbrain/opr/dnn/fake_quant.h"
 #include "megbrain/opr/dnn/tqt.h"

 #include "megbrain/serialization/sereg.h"
 #include "megdnn/opr_param_defs.h"
 #include "megdnn/oprs/nn.h"
--- a/src/opr/impl/search_policy/algo_chooser.cpp
+++ b/src/opr/impl/search_policy/algo_chooser.cpp
@@ -284,8 +284,9 @@ namespace mgb {
 namespace opr {

 template <typename Opr>
 void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) {
    if (ctx.get_profile_result_from_cache(require_reproducible).valid())
 void AlgoChooser<Opr>::profile(ExeContext& ctx,
                               ExecutionStrategy select_strategy) {
    if (ctx.get_profile_result_from_cache(select_strategy).valid())
        return;
    AlgoChooserProfileCache::Result prof_rst;

@@ -305,7 +306,7 @@ void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) {
                                   algo.name.c_str(), str_on_inp_shape.c_str());
        ImplExecutionPolicy policy;
        policy.algo = algo.desc;
        ctx.construct_execution_policy(require_reproducible, policy);
        ctx.construct_execution_policy(select_strategy, policy);
        if (ctx.get_workspace_size_bytes(policy) >= workspace_limit)
            continue;

@@ -354,7 +355,8 @@ void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) {

 template <typename Opr>
 typename AlgoChooser<Opr>::ImplExecutionPolicy
 AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible,
 AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx,
                                    ExecutionStrategy select_strategy,
                                    bool enable_update) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("AlgoChooser::choose_by_profile")))
    if (ctx.owner_graph()->options().no_profiling_on_shape_change) {
@@ -376,11 +378,11 @@ AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible,
                    to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
                    _item.param, ctx.mgb_opr(), ctx.comp_node(),
                    ctx.execution_policy(), ctx.allow_weight_preprocess());
            AlgoChooser<_Opr>::profile(sub_ctx, require_reproducible);
            AlgoChooser<_Opr>::profile(sub_ctx, select_strategy);
        });
    }
    typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
    ctx.construct_execution_policy(require_reproducible, policy);
    ctx.construct_execution_policy(select_strategy, policy);
    return policy;
    MIDOUT_E
 }
@@ -402,11 +404,9 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
    ImplExecutionPolicy policy;
    if (auto algo_choose_hook = mgb_opr->algo_chooser()) {
        policy = algo_choose_hook(mgb_opr);
        ctx.construct_execution_policy(
                mgb_opr->execution_policy().strategy ==
                        mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::
                                HEURISTIC_REPRODUCIBLE,
                policy, false);
        ctx.construct_execution_policy((ExecutionStrategy::HEURISTIC |
                                        ExecutionStrategy::REPRODUCIBLE),
                                       policy, false);
    }
    if (!policy.algo.valid()) {
        policy = get_policy(ctx);
@@ -419,10 +419,9 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
    Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo);
    mgb_assert(palgo, "Unknown algo description");
    ret.append("): algo=" + std::string(palgo->name()));
    ret.append(ssprintf(" workspace=%.2fMiB reproducible=%d",
    ret.append(ssprintf(" workspace=%.2fMiB attirbute=%d",
                        workspace / (1024 * 1024.0),
                        palgo->contain_attribute(
                                megdnn::AlgoAttribute::REPRODUCIBLE)));
                        static_cast<uint32_t>(palgo->attribute())));
    mgb_log_debug("%s", ret.c_str());

    megdnn_opr->execution_policy() = policy;
@@ -432,41 +431,39 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
 //! Choose the ImplExecutionPolicy for \p ctx according to the strategy stored
 //! in ctx.execution_policy(). Throws GraphError when no handled strategy flag
 //! is set.
 template <typename Opr>
 typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::get_policy(
        ExeContext& ctx) {
    using S = mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    MGB_MARK_USED_VAR(TIMEOUT_TOLERANCE);
    switch (ctx.execution_policy().strategy) {
        case S::HEURISTIC:
            return ctx.choose_by_heuristic();
        case S::HEURISTIC_REPRODUCIBLE:
            return ctx.choose_by_heuristic(true);
        case S::PROFILE_HEURISTIC: {
            ImplExecutionPolicy policy = choose_by_profile(ctx, false, false);
            if (!policy.algo.valid())
                policy = ctx.choose_by_heuristic();
            return policy;
        }
    // The strategy is examined flag by flag with operator&, so several flags
    // can be set at the same time; the checks below are ordered from the most
    // specific combination to the least specific one.
    auto opr_strategy = ctx.execution_policy().strategy;
    if ((opr_strategy & ExecutionStrategy::HEURISTIC) &&
               (opr_strategy & ExecutionStrategy::PROFILE)) {
        // HEURISTIC and PROFILE together: call choose_by_profile with
        // enable_update = false and fall back to the heuristic when that does
        // not yield a valid algo.
        ImplExecutionPolicy policy =
                choose_by_profile(ctx, opr_strategy, false);
        if (!policy.algo.valid())
            policy = ctx.choose_by_heuristic(opr_strategy);
        return policy;
    } else if ((opr_strategy & ExecutionStrategy::HEURISTIC)) {
        return ctx.choose_by_heuristic(opr_strategy);
    }
 #if MGB_ENABLE_FASTRUN
        case S::PROFILE:
            return choose_by_profile(ctx, false);
        case S::PROFILE_REPRODUCIBLE:
            return choose_by_profile(ctx, true);
    else if (opr_strategy & ExecutionStrategy::PROFILE) {
        // PROFILE without HEURISTIC; this branch only exists when
        // MGB_ENABLE_FASTRUN is enabled.
        return choose_by_profile(ctx, opr_strategy);
    }
 #endif
        default:
            mgb_throw(GraphError, "bad convolution ExecutionPolicy strategy");
    else {
        // Neither HEURISTIC nor a compiled-in PROFILE branch matched.
        mgb_throw(GraphError, "bad convolution ExecutionPolicy strategy");
    }
 }

 //! Explicit template instantiations of the AlgoChooser member functions
 //! get_policy, profile, choose_by_profile and setup_algo for megdnn::Opr.
 //! The macro is expanded through MGB_FOREACH_FASTRUN_OPR.
 #define INST(Opr)                                                            \
    template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy                   \
    AlgoChooser<megdnn::Opr>::get_policy(ExeContext& ctx);                   \
    template void AlgoChooser<megdnn::Opr>::profile(                         \
            ExeContext& ctx, bool require_reproducible);                     \
    template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy                   \
    AlgoChooser<megdnn::Opr>::choose_by_profile(                             \
            ExeContext& ctx, bool require_reproducible, bool enable_update); \
    template size_t AlgoChooser<megdnn::Opr>::setup_algo(                    \
            const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr,      \
            const MGBOpr* mgb_opr, bool allow_weight_preprocess);            \
 #define INST(Opr)                                                       \
    template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy              \
    AlgoChooser<megdnn::Opr>::get_policy(ExeContext& ctx);              \
    template void AlgoChooser<megdnn::Opr>::profile(ExeContext& ctx,    \
                                                    ExecutionStrategy); \
    template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy              \
    AlgoChooser<megdnn::Opr>::choose_by_profile(                        \
            ExeContext& ctx, ExecutionStrategy, bool enable_update);    \
    template size_t AlgoChooser<megdnn::Opr>::setup_algo(               \
            const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \
            const MGBOpr* mgb_opr, bool allow_weight_preprocess);

 MGB_FOREACH_FASTRUN_OPR(INST)

@@ -498,7 +495,7 @@ AlgoChooser<Opr>::ExeContext::ExeContext(
 template <typename Opr>
 typename AlgoChooser<Opr>::ImplAlgo
 AlgoChooser<Opr>::ExeContext::get_profile_result_from_cache(
        bool require_reproducible) const {
        ExecutionStrategy select_strategy) const {
    MIDOUT_B(Opr,
             midout_iv(MGB_HASH_STR(
                     "AlgoChooser::ExeContext::get_profile_result_from_cache")))
@@ -522,7 +519,9 @@ AlgoChooser<Opr>::ExeContext::get_profile_result_from_cache(
    if (prof.empty())
        return {};
    for (auto&& i : prof) {
        if ((!require_reproducible || i.reproducible)) {
        if (!(select_strategy & ExecutionStrategy::REPRODUCIBLE) ||
            static_cast<AlgoAttribute>(i.attribute) &
                    AlgoAttribute::REPRODUCIBLE) {
            auto iter = algo_map.find(i.algo);
            mgb_assert(iter != algo_map.end(),
                       "algorithm %s exists in "
@@ -550,7 +549,8 @@ AlgoChooser<Opr>::ExeContext::get_profile_result_from_cache(

 template <typename Opr>
 typename AlgoChooser<Opr>::ImplExecutionPolicy
 AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
 AlgoChooser<Opr>::ExeContext::choose_by_heuristic(
        ExecutionStrategy select_strategy) const {
    if (m_execution_policy.workspace_limit !=
        std::numeric_limits<decltype(
                m_execution_policy.workspace_limit)>::max()) {
@@ -558,6 +558,8 @@ AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
                "workspace_limit should not be setted if choose algo by "
                "heuristic");
    }
    bool reproducible = static_cast<bool>(select_strategy &
                                          ExecutionStrategy::REPRODUCIBLE);
    auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
            owner_graph(), m_cn, m_execution_policy.workspace_limit);
    ImplExecutionPolicy policy;
@@ -579,7 +581,8 @@ AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
                to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
                _item.param, m_base_mgb_opr, m_cn, m_execution_policy,
                m_allow_weight_preprocess);
        policy.sub_policy.push_back(sub_ctx.choose_by_heuristic(reproducible));
        policy.sub_policy.push_back(
                sub_ctx.choose_by_heuristic(select_strategy));
    });

    return policy;
@@ -588,9 +591,8 @@ AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
 template <typename Opr>
 std::vector<typename AlgoChooser<Opr>::ImplAlgo>
 AlgoChooser<Opr>::ExeContext::get_all_candidates() const {
    auto heu = choose_by_heuristic();
    auto&& ret =
            APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts);
    auto heu = choose_by_heuristic(ExecutionStrategy::HEURISTIC);
    auto&& ret = APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts);
    bool found = false;
    for (size_t i = 0; i < ret.size(); ++i) {
        if (ret[i].desc == heu.algo) {
@@ -611,19 +613,21 @@ AlgoChooser<Opr>::ExeContext::get_all_candidates() const {

 template <typename Opr>
 void AlgoChooser<Opr>::ExeContext::construct_execution_policy(
        bool require_reproducible,
        ExecutionStrategy select_strategy,
        typename AlgoChooser<Opr>::ImplExecutionPolicy& policy,
        bool retrive_from_cache) const {
    bool reproducible = static_cast<bool>(select_strategy &
                                          ExecutionStrategy::REPRODUCIBLE);
    if (!policy.algo.valid()) {
        if (retrive_from_cache) {
            policy.algo =
                    get_profile_result_from_cache(require_reproducible).desc;
                    get_profile_result_from_cache(select_strategy).desc;
        } else {
            auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
                    owner_graph(), m_cn, m_execution_policy.workspace_limit);
            policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic(
                                        args..., workspace_limit,
                                        require_reproducible),
                                        reproducible),
                                m_layouts)
                                  .desc;
        }
@@ -647,7 +651,7 @@ void AlgoChooser<Opr>::ExeContext::construct_execution_policy(
                _item.param, m_base_mgb_opr, m_cn, m_execution_policy,
                m_allow_weight_preprocess);
        policy.sub_policy.push_back({});
        sub_ctx.construct_execution_policy(require_reproducible,
        sub_ctx.construct_execution_policy(select_strategy,
                                           policy.sub_policy.back(),
                                           retrive_from_cache);
    });
@@ -718,8 +722,7 @@ AlgoChooser<Opr>::ExeContext::profile_single_algo(
        return None;
    return AlgoChooserProfileCache::ResultEntry{
            palgo->name(),
            palgo->contain_attribute(
                    megdnn::AlgoAttribute::REPRODUCIBLE),
            static_cast<uint32_t>(palgo->attribute()),
            rst.val().time, param.workspace};
 }

@@ -768,10 +771,10 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const {
            bool allow_weight_preprocess);                                     \
    template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy            \
    AlgoChooser<megdnn::Opr>::ExeContext::choose_by_heuristic(                 \
            bool reproducible) const;                                          \
            ExecutionStrategy select_strategy) const;                          \
    template typename AlgoChooser<megdnn::Opr>::ImplAlgo                       \
    AlgoChooser<megdnn::Opr>::ExeContext::get_profile_result_from_cache(       \
            bool require_reproducible) const;                                  \
            ExecutionStrategy select_strategy) const;                          \
    template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo>          \
    AlgoChooser<megdnn::Opr>::ExeContext::get_all_candidates() const;          \
    template size_t                                                            \
@@ -780,7 +783,7 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const {
                    policy) const;                                             \
    template void                                                              \
    AlgoChooser<megdnn::Opr>::ExeContext::construct_execution_policy(          \
            bool require_reproducible,                                         \
            ExecutionStrategy select_strategy,                                 \
            typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& policy,    \
            bool retrive_from_cache) const;                                    \
    template Maybe<AlgoChooserProfileCache::ResultEntry>                       \
--- a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h
+++ b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h
@@ -35,6 +35,13 @@ MGB_FOREACH_FASTRUN_OPR(cb)
 #undef cb

 namespace mgb {

 //! Shorthand aliases for megdnn::param::ExecutionPolicy::Strategy and
 //! megdnn::AlgoAttribute. Both are flag enums: the algo chooser combines and
 //! tests their values with bitwise operators (e.g. strategy & REPRODUCIBLE).
 using ExecutionStrategy = megdnn::param::ExecutionPolicy::Strategy;

 using AlgoAttribute = megdnn::AlgoAttribute;

 namespace opr {

 /* =================== AlgoChooser =================== */
@@ -103,7 +110,7 @@ public:
        const FixedTensorLayouts& layouts() const { return m_layouts; }

        //! choose an algorithm by heuristic; \p select_strategy is also
        //! forwarded unchanged when choosing for each sub operator.
        //! NOTE(review): the implementation derives a `reproducible` flag from
        //! the REPRODUCIBLE bit of \p select_strategy -- confirm whether any
        //! other bit influences the choice.
        ImplExecutionPolicy choose_by_heuristic(
                bool reproducible = false) const;
                ExecutionStrategy select_strategy) const;

        //! get all candidate algos, and the one choose_by_heuristic() is
        //! put first
@@ -126,19 +133,20 @@ public:
                const ImplExecutionPolicy& policy, double& timeout) const;

        //! look up a profiled algorithm in the cache. When \p select_strategy
        //! has the REPRODUCIBLE bit set, only cache entries whose attribute
        //! contains AlgoAttribute::REPRODUCIBLE are accepted. Returns an
        //! invalid (default-constructed) ImplAlgo when the cache has no entry.
        ImplAlgo get_profile_result_from_cache(bool require_reproducible) const;
        ImplAlgo get_profile_result_from_cache(
                ExecutionStrategy select_strategy) const;

        /**
         * \brief complete \p policy (and its sub policies) from the profile
         *     cache or from the heuristic.
         *
         * \param require_reproducible select algo which is reproducible
         * \param select_strategy strategy flags the selected algo has to match
         * \param policy execution policy to fill in; policy.algo is only
         *     looked up when it is not already valid
         * \param retrive_from_cache retrieve the algo from the profile cache
         *     if true, get it from the heuristic otherwise.
         */
        void construct_execution_policy(
                bool require_reproducible, ImplExecutionPolicy& policy,
                bool retrive_from_cache = true) const;
        void construct_execution_policy(ExecutionStrategy select_strategy,
                                        ImplExecutionPolicy& policy,
                                        bool retrive_from_cache = true) const;

    private:
        Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const;
@@ -153,11 +161,11 @@ private:


    //! profile and save to cache; returns without profiling when the cache
    //! already holds a valid result for \p select_strategy
    static void profile(ExeContext& ctx, bool require_reproducible);
    static void profile(ExeContext& ctx, ExecutionStrategy select_strategy);

    //! build an execution policy from profiling results; the returned policy
    //! is filled by ExeContext::construct_execution_policy using
    //! \p select_strategy.
    //! TODO(doc): describe \p enable_update -- get_policy passes false when
    //! both HEURISTIC and PROFILE are requested, its effect should be
    //! confirmed against the implementation.
    static ImplExecutionPolicy choose_by_profile(ExeContext& ctx,
                                                 bool require_reproducible,
                                                 bool enable_update = true);
    static ImplExecutionPolicy choose_by_profile(
            ExeContext& ctx, ExecutionStrategy select_strategy,
            bool enable_update = true);

 public:
    /*!
--- a/src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h
+++ b/src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h
@@ -13,7 +13,6 @@
 #pragma once

 #include "megbrain/graph/operator_node.h"
 #include "megbrain/opr/param_defs.h"
 #include "megdnn/oprs/base.h"
 #include "megdnn/oprs/nn.h"

@@ -73,7 +72,6 @@ protected:

 };
 }  // namespace mixin

 }  // namespace opr
 }  // namespace mgb

--- a/src/opr/test/blas.cpp
+++ b/src/opr/test/blas.cpp
@@ -429,10 +429,11 @@ TEST(TestOprDNN, MatrixMulExePolicy) {
    auto cn = CompNode::load("cpux");

 #if MGB_ENABLE_FASTRUN
    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
                          S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
                        S::PROFILE | S::HEURISTIC}) {
 #else
    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
    for (auto strategy: {S:HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif

        auto graph = ComputingGraph::make();
--- a/src/opr/test/dnn/convolution.cpp
+++ b/src/opr/test/dnn/convolution.cpp
@@ -355,11 +355,13 @@ TEST(TestOprDNN, ConvBiasExePolicy) {
    auto cn = CompNode::load("cpux");

 #if MGB_ENABLE_FASTRUN
    for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S : HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif

        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;

@@ -397,7 +399,8 @@ TEST(TestOprDNN, ConvBiasExePolicy_Quantized8Asym) {

    auto cn = CompNode::load("cpux");

    for (auto strategy: {S::PROFILE, S::PROFILE_REPRODUCIBLE}) {
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::PROFILE | S::REPRODUCIBLE}) {

        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
@@ -439,10 +442,12 @@ TEST(TestOprDNN, ConvolutionExePolicy) {
    PersistentCacheHook cache_hook{on_get};

 #if MGB_ENABLE_FASTRUN
    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
                          S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S : HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif
        using Checker = AutoOprChecker<2, 1>;

@@ -522,10 +527,11 @@ TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
    PersistentCacheHook cache_hook{on_get};

 #if MGB_ENABLE_FASTRUN
    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
                          S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
          S(S::PROFILE | S::HEURISTIC)}) {
 #else
    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
    for (auto strategy: {S:HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
 #endif
        using Checker = AutoOprChecker<2, 1>;

@@ -1183,9 +1189,12 @@ TEST(TestOprDNN, Convolution3DExePolicy) {
    using S = Policy::Strategy;

 #if MGB_ENABLE_FASTRUN
    for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
          S::PROFILE | S::HEURISTIC}) {
 #else
    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S : HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif

        using Checker = AutoOprChecker<2, 1>;
@@ -1660,10 +1669,12 @@ TEST(TestOprDNN, LocalShareForwardExecPolicy) {
    PersistentCacheHook cache_hook{on_get};

 #if MGB_ENABLE_FASTRUN
    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
                          S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S : HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif
        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
@@ -1769,10 +1780,12 @@ TEST(TestOprDNN, DeformableConvForward) {
    Param param;

 #if MGB_ENABLE_FASTRUN
    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
                          S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
    for (auto strategy : {S : HEURISTIC, S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S : HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif
        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
@@ -1936,10 +1949,12 @@ TEST(TestOprDNN, BatchConvBiasForward) {
    param.sparse = Param::Sparse::DENSE;

 #if MGB_ENABLE_FASTRUN
    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
                          S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
    for (auto strategy : {S : HEURISTIC, S::PROFILE_HEURISTIC}) {
    for (auto strategy :
         SmallVector<S>{S : HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif

        auto make_quantized = [&](SymbolVar x, const DType& dtype) {
@@ -2080,7 +2095,8 @@ TEST(TestOprDNN, HeuristicReproducible) {

    constexpr size_t PH = 1, PW = 1, SH = 1, SW = 1;

    for (auto strategy : {S::HEURISTIC, S::HEURISTIC_REPRODUCIBLE}) {
    for (auto strategy :
         SmallVector<S>{S::HEURISTIC, S::HEURISTIC | S::REPRODUCIBLE}) {
        VarNode* bwd_flt;
        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
@@ -2126,7 +2142,7 @@ TEST(TestOprDNN, HeuristicReproducible) {
            megdnn::Algorithm* palgo =
                    megdnn_opr->get_algorithm_from_desc(algo);
            mgb_assert(palgo, "Unknown algo description");
            if (strategy == S::HEURISTIC_REPRODUCIBLE) {
            if (strategy == S(S::HEURISTIC | S::REPRODUCIBLE)) {
                EXPECT_TRUE(palgo->contain_attribute(
                            megdnn::AlgoAttribute::REPRODUCIBLE));
            }
--- a/test/src/include/megbrain/test/helper.h
+++ b/test/src/include/megbrain/test/helper.h
@@ -43,6 +43,7 @@ namespace megdnn {
            std::ostream &ostr, const DType &dt) {
        return ostr << dt.name();
    }

 } // namespace megdnn

 namespace mgb {
--- a/tools/param_defs/mgb_opr_param_defs.py
+++ b/tools/param_defs/mgb_opr_param_defs.py
@@ -18,7 +18,7 @@ pdef('PersistentOutputStorage').add_fields(
 add_const('int32', 'INVALID_AXIS', 'MAX_NDIM').
 add_fields('int32', 'axis', 'INVALID_AXIS'))

 (pdef('ExecutionPolicy', 'specify how to select an algorithm for an operator').
 (pdef('ExecutionPolicy', version=0, is_legacy=True).
 add_enum('Strategy',
          Doc('HEURISTIC', 'use heuristic to choose the fastest algorithm'),
          Doc('HEURISTIC_REPRODUCIBLE', 'use heuristic to choose the fastest algorithm, '
@@ -33,6 +33,20 @@ pdef('PersistentOutputStorage').add_fields(
            Doc('workspace_limit', 'workspace limit in bytes'),
            str(2**64-1)+'ull'))

 # ExecutionPolicy, version 1: Strategy is declared as a bit-combination enum
 # so that its members can be OR-ed together (e.g. PROFILE | REPRODUCIBLE).
 (pdef('ExecutionPolicy', 'specify how to select an algorithm for an operator', version=1).
 add_bit_combination_enum('Strategy',
          Doc('HEURISTIC', 'use heuristic to choose the fastest algorithm'),
          Doc('PROFILE',
              'run possible algorithms on real device to find the best'),
          # The trailing space matters: adjacent literals are concatenated, and
          # without it the text reads "...the algosmust be reproducible".
          Doc('REPRODUCIBLE',
              'when profile or heuristic algo selection it require the algos '
              'must be reproducible'),
          # The member name keeps its existing spelling because C++ code
          # refers to it as Strategy::OPTMIZED; only the doc text is corrected.
          Doc('OPTMIZED',
              'profile require algos are optimized to achieve fast-profile')).
 add_fields('uint64',
            Doc('workspace_limit', 'workspace limit in bytes'),
            str(2**64-1)+'ull'))

 (pdef('AssertEqual').
 add_fields('float32',
            Doc('maxerr', 'max allowed error; error is defined as the minimal '