fix(megbrain): add rdnn to copybara

GitOrigin-RevId: 7d8bf77053
3 years ago · d56570d929
--- a/imperative/src/impl/proxy_graph/mini_graph.h
+++ b/imperative/src/impl/proxy_graph/mini_graph.h
@@ -320,15 +320,12 @@ public:
            input->force_assign_dev_tensor_from_tensor(dev_tensor);
            mgb_assert(input->comp_node() == dev_tensor.comp_node());
            mgb_assert(input->shape().eq_shape(layout));
            mgb_assert(input->dtype() == layout.dtype);
            idx++;
        }
    }
    void init_output_tensor(const SmallVector<Tensor*>& outputs) {
        mgb_assert(m_opr->usable_output().size() == outputs.size());
        ::mgb::opr::intl::WorkspaceLimitHook::set_impl(
                m_opr->owner_graph(), get_workspace_limit);
@@ -347,9 +344,6 @@ public:
                mgb_assert(j < outputs.size());
                auto&& tensor = outputs[j];
                auto&& layout = tensor->layout();
                mgb_assert(var->comp_node() == tensor->comp_node());
                mgb_assert(var->shape().eq_shape(layout));
                mgb_assert(var->dtype() == layout.dtype);
                if (var->m_mem_plan.chunk().owner_var != var) {
                    tensor->assign_from_dev_tensor(
                            var->m_dev_tensor);  // memory forwarding
@@ -816,7 +810,6 @@ public:
        // minigraph.opr()->usable_output() bug execution may use the attrs for those
        // output var, so we infer attrs for all outputs, but only return
        // LogicalTensorDesc for minigraph.opr()->usable_output()
        SmallVector<LogicalTensorDesc> output_descs;
        for (size_t i = 0; i < minigraph.opr()->output().size(); ++i) {
            auto* var = minigraph.opr()->output()[i];
            auto* shape = sess.infer(sess.output_data[i].shape_infer, true);
@@ -824,19 +817,15 @@ public:
            var->shape(*shape);
        }
        for (size_t i = 0; i < minigraph.output_size(); ++i) {
        SmallVector<TensorPtr> outputs(minigraph.output_size(), {});
        for (size_t i = 0; i < outputs.size(); i++) {
            auto* ovar = minigraph.output_var(i);
            mgb_assert(ovar->dtype().valid() && ovar->comp_node().valid());
            mgb_assert(
                    ovar->shape().ndim ||
                    ovar->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC));
            output_descs.push_back({{ovar->shape(), ovar->dtype()}, ovar->comp_node()});
        }
        SmallVector<TensorPtr> outputs(output_descs.size(), {});
        for (size_t i = 0; i < outputs.size(); i++) {
            outputs[i] =
                    Tensor::make(output_descs[i].layout, output_descs[i].comp_node);
            outputs[i] = Tensor::make(
                    TensorLayout{ovar->shape(), ovar->dtype()}, ovar->comp_node());
        }
        auto raw_outputs = to_raw_ptr_array(outputs, false);
--- a/src/rdnn/impl/algo_chooser.cpp
+++ b/src/rdnn/impl/algo_chooser.cpp
--- a/src/rdnn/impl/management.cpp
+++ b/src/rdnn/impl/management.cpp
@@ -0,0 +1,66 @@
 #include "megbrain/rdnn/management.h"
 #include "megbrain/comp_node_env.h"
 #include "megbrain/tensor.h"
 #include "megbrain/utils/metahelper.h"
 #include "megdnn/handle.h"
 #include "megdnn/oprs.h"
 /* ================== global functions ================== */
 using namespace mgb;
 using namespace mgb::opr;
 namespace {
 template <class Opr>
 class MegDNNGlobalOprContainer final : public UserDataContainer::UserData {
    MGB_TYPEINFO_OBJ_DECL;
    std::shared_ptr<megdnn::Handle> m_megdnn_handle;
    std::unique_ptr<Opr> m_opr;
 public:
    MegDNNGlobalOprContainer(CompNode cn)
            : m_megdnn_handle{intl::get_megdnn_handle_shared(cn)},
              m_opr{m_megdnn_handle->create_operator<Opr>()} {
        mgb_assert(m_opr->is_thread_safe());
    }
    Opr* get() const { return m_opr.get(); }
 };
 template <class Opr>
 MGB_TYPEINFO_OBJ_IMPL(MegDNNGlobalOprContainer<Opr>);
 }  // anonymous namespace
 std::shared_ptr<megdnn::Handle> intl::get_megdnn_handle_shared(CompNode comp_node) {
    auto& handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node));
    return {handle.shared_from_this(), handle.handle()};
 }
 megdnn::Handle* intl::get_megdnn_handle(CompNode comp_node) {
    return MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)).handle();
 }
 template <typename Opr>
 Opr* intl::get_megdnn_global_opr(CompNode comp_node) {
    using T = MegDNNGlobalOprContainer<Opr>;
    auto maker = [comp_node]() { return std::make_shared<T>(comp_node); };
    return CompNodeEnv::from_comp_node(comp_node).get_user_data<T>(maker).get();
 }
 namespace mgb {
 namespace opr {
 namespace intl {
 #define INST(o) template o* get_megdnn_global_opr<o>(CompNode)
 INST(megdnn::AddUpdate);
 INST(megdnn::Relayout);
 INST(megdnn::Checksum);
 #undef INST
 }  // namespace intl
 }  // namespace opr
 }  // namespace mgb
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/rdnn/impl/profiler.cpp
+++ b/src/rdnn/impl/profiler.cpp
@@ -0,0 +1,400 @@
 #include "megbrain/rdnn/profiler.h"
 #include "megbrain/utils/invoke.h"
 #include "megdnn/handle.h"
 #include "megdnn/oprs/base.h"
 #if MGB_ROCM
 #include "hcc_detail/hcc_defs_prologue.h"
 #include "megcore_rocm.h"
 #endif
 //! TODO: here has to be know some megdnn::opr when there is produced midout.h
 //! fix it if there is another graceful way.
 #include "megdnn/oprs.h"
 #include "midout.h"
 MIDOUT_DECL(megbrain_opr_profile)
 #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) {
 #define MIDOUT_E \
    }            \
    MIDOUT_END();
 namespace {
 std::string serialize_policy(const megdnn::ExecutionPolicy& policy) {
    std::string ret;
    //! serialize AlgorithmDesc
    megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret);
    megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret);
    uint32_t param_size = policy.algo.param.size();
    uint32_t name_size = policy.algo.name.size();
    megdnn::Algorithm::serialize_write_pod<uint32_t>(param_size, ret);
    megdnn::Algorithm::serialize_write_pod<uint32_t>(name_size, ret);
    ret += policy.algo.param;
    ret += policy.algo.name;
    //! serialize sub_policy
    uint32_t size = policy.sub_policy.size();
    megdnn::Algorithm::serialize_write_pod(size, ret);
    for (auto&& sub : policy.sub_policy) {
        ret += serialize_policy(sub);
    }
    return ret;
 }
 megdnn::ExecutionPolicy deserialize_policy(
        const char* buf, uint32_t size, uint32_t& offset) {
    megdnn::ExecutionPolicy ret;
 #define cb(_val, _type)                                                 \
    _val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \
    offset += sizeof(_val)
    cb(ret.algo.handle_type, megdnn::Handle::HandleType);
    cb(ret.algo.type, uint32_t);
    uint32_t param_size = 0;
    uint32_t name_size = 0;
    cb(param_size, uint32_t);
    cb(name_size, uint32_t);
    if (param_size > 0) {
        ret.algo.param = std::string(buf + offset, param_size);
        offset += param_size;
    }
    if (name_size > 0) {
        ret.algo.name = std::string(buf + offset, name_size);
        offset += name_size;
    }
    uint32_t nr_policy = 0;
    cb(nr_policy, uint32_t);
 #undef cb
    for (uint32_t i = 0; i < nr_policy; i++) {
        ret.sub_policy.push_back(deserialize_policy(buf, size, offset));
    }
    return ret;
 }
 }  // namespace
 namespace mgb {
 namespace rdnn {
 #define APPLY(statement, ...)                               \
    mgb::apply(                                             \
            [&](const auto&... args) { return statement; }, \
            std::tuple_cat(__VA_ARGS__))
 ////////////// TimedProfiler::Param::ExecutionPolicyBlob //////////////////////
 template <typename Opr>
 typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob TimedProfiler<Opr>::Param::
        ExecutionPolicyBlob::serialize(const megdnn::ExecutionPolicy& policy) {
    ExecutionPolicyBlob ret;
    std::string serialize_bin = serialize_policy(policy);
    mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES);
    memcpy(ret.data, serialize_bin.data(), serialize_bin.size());
    ret.size = serialize_bin.size();
    return ret;
 }
 template <typename Opr>
 megdnn::ExecutionPolicy TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize()
        const {
    uint32_t offset = 0;
    auto&& ret = deserialize_policy(data, size, offset);
    mgb_assert(offset == size);
    return std::move(ret);
 }
 #define INST(Opr)                                                            \
    template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob \
    TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize(       \
            const megdnn::ExecutionPolicy& policy);                          \
    template megdnn::ExecutionPolicy                                         \
    TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() const;
 DNN_FOREACH_FASTRUN_OPR(INST)
 #undef INST
 ////////////////// TimedProfiler //////////////////////////////
 template <typename Opr>
 const double TimedProfiler<Opr>::timeout_setting =
        TimedProfiler<Opr>::init_timeout_setting();
 template <typename Opr>
 double TimedProfiler<Opr>::init_timeout_setting() {
 #if MGB_ENABLE_FASTRUN
    sys::TimedFuncInvoker::ins().register_func(
            AlgoChooserFuncId<Opr>::ID, &TimedProfiler<Opr>::prof_impl,
            &TimedProfiler<Opr>::prof_init_device);
    auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT");
    if (to_set)
        return std::stod(to_set);
 #endif
    return 0;
 }
 #define APPLY(statement, ...)                               \
    mgb::apply(                                             \
            [&](const auto&... args) { return statement; }, \
            std::tuple_cat(__VA_ARGS__))
 template <typename Opr>
 void TimedProfiler<Opr>::preprocess(
        const TensorLayoutArray&, const megdnn::SmallVector<DeviceTensorND>&,
        UniqPtrWithCN<Opr>&, megdnn::Workspace&, std::array<TensorLayout, arity>&,
        std::array<DeviceTensorND, arity_in>&, PreprocessFilter<Opr>&) {
    // Opr is neither convbias nor convolution.This function do nothing.
 }
 //! convbias
 template <>
 void TimedProfiler<megdnn::ConvBias>::preprocess(
        const TensorLayoutArray& preprocessed_layout,
        const SmallVector<DeviceTensorND>& flt_val,
        UniqPtrWithCN<megdnn::ConvBias>& megdnn_opr, megdnn::Workspace& mdn_workspace,
        std::array<TensorLayout, arity>& layouts,
        std::array<DeviceTensorND, arity_in>& inp_val,
        PreprocessFilter<megdnn::ConvBias>& prep_flt) {
    if (!preprocessed_layout.empty()) {
        auto&& pf = prep_flt;
        pf.algorithm_id = nullptr;
        pf.tensors.resize(flt_val.size());
        for (size_t i = 0; i < flt_val.size(); i++) {
            pf.tensors[i] = flt_val[i].as_megdnn();
        }
        APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
              std::forward_as_tuple(
                      layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()),
              array_skip<arity_in - 1>(layouts));
    }
 }
 //! convolution
 template <>
 void TimedProfiler<megdnn::ConvolutionForward>::preprocess(
        const TensorLayoutArray& preprocessed_layout,
        const megdnn::SmallVector<DeviceTensorND>& flt_val,
        UniqPtrWithCN<megdnn::ConvolutionForward>& megdnn_opr,
        megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
        std::array<DeviceTensorND, arity_in>& inp_val,
        PreprocessFilter<megdnn::ConvolutionForward>& prep_flt) {
    if (!preprocessed_layout.empty()) {
        auto&& pf = prep_flt;
        pf.algorithm_id = nullptr;
        pf.tensors.resize(flt_val.size());
        for (size_t i = 0; i < flt_val.size(); i++) {
            pf.tensors[i] = flt_val[i].as_megdnn();
        }
        APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
              std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()),
              array_skip<2>(layouts));
    }
 }
 template <typename Opr>
 typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
        const TParam& raw_param) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl")))
 #if MGB_ROCM
    bool miopen_algo_search_enabled;
    megcore::getMIOpenAlgoSearchStatus(&miopen_algo_search_enabled);
    mgb_assert(miopen_algo_search_enabled, "MIOpen algo search not enabled");
 #endif
    auto&& param = raw_param.as_single_pod<Param>();
    CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical);
    auto megdnn_opr = opr::intl::create_megdnn_opr<Opr>(cn);
    std::array<TensorLayout, arity> layouts;
    auto from_enum = [&](DTypeEnum enumv) -> DType {
        switch (enumv) {
 #define cb(_dt)                  \
    case DTypeTrait<_dt>::enumv: \
        return _dt(1.0f, static_cast<uint8_t>(0))
            cb(dtype::Quantized8Asymm);
            cb(dtype::Quantized4Asymm);
 #undef cb
 #define cb(_dt)                  \
    case DTypeTrait<_dt>::enumv: \
        return _dt(1.0f)
            cb(dtype::QuantizedS8);
            cb(dtype::QuantizedS16);
            cb(dtype::QuantizedS32);
            cb(dtype::QuantizedS4);
            default:
                return DType::from_enum(enumv);
 #undef cb
        }
    };
    for (int i = 0; i < arity; ++i) {
        layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])};
    }
    megdnn_opr->param() = param.opr_param;
    megdnn_opr->execution_policy() = param.execution_policy.deserialize();
    // Allocate preprocessed weight buffers.
    TensorLayoutArray preprocessed_layout;
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        if (param.allow_weight_preprocess) {
            preprocessed_layout = APPLY(
                    _(megdnn_opr)->deduce_preprocessed_filter_layout(args...), layouts);
        }
    });
    {
        // first allocate a whole chunk to avoid memory fragmentation (here we
        // rely on memory allocator to reuse memory)
        auto align = cn.get_mem_addr_alignment();
        size_t tot_size = align;
        for (int i = 0; i < arity; ++i) {
            tot_size += layouts[i].span().high_byte + align;
        }
        for (const auto& layout : preprocessed_layout) {
            tot_size += layout.span().high_byte + align;
        }
        tot_size += param.workspace;
        DeviceTensorStorage storage{cn};
        storage.ensure_size(tot_size);
    }
    // allocate input and output memory
    std::array<DeviceTensorND, arity_in> inp_val;
    std::array<DeviceTensorND, arity_out> out_val;
    DeviceTensorND workspace;
    for (int i = 0; i < arity_in; ++i) {
        inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]);
    }
    for (int i = 0; i < arity_out; ++i) {
        out_val[i]
                .comp_node(cn)
                .dtype(layouts[arity_in + i].dtype)
                .resize(layouts[arity_in + i]);
    }
    megdnn::Workspace mdn_workspace;
    // allocate workspace
    if (param.workspace) {
        workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace});
        mdn_workspace.size = param.workspace;
        mdn_workspace.raw_ptr = workspace.raw_ptr();
    }
    // allocate storage for preprocessed filter
    SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size());
    for (size_t i = 0; i < preprocessed_layout.size(); i++) {
        flt_val[i] = {
                cn, preprocessed_layout[i], preprocessed_layout[i].dtype,
                preprocessed_layout[i].format};
    }
    for (int i = 0; i < arity_in; ++i) {
        fill_zero_dev_tensor(inp_val[i]);
    }
    PreprocessFilter<Opr> prep_flt;
    preprocess(
            preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val,
            prep_flt);
    RealTimer timer;
    auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER),
         ev_end = cn.create_event(CompNode::Event::NEED_TIMER);
    ev_start->record();
    if_constexpr<opr_supports_preprocess<Opr>()>(
            [&](auto _) {
                auto&& opr = _(megdnn_opr);
                PreprocessFilter<Opr>* pf =
                        preprocessed_layout.empty() ? nullptr : &prep_flt;
                APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val,
                      out_val);
            },
            /* else */
            [&](auto _) {
                APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val,
                      out_val);
            });
    ev_end->record();
    megdnn::Algorithm* algo =
            megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
    mgb_assert(algo);
    double next_report_time = 0.5;
    while (!ev_end->finished()) {
        if (timer.get_secs() >= next_report_time) {
 #if MGB_ENABLE_GETENV
            mgb_log_warn(
                    "profiling conv algo %s already took %.3f/%.3f secs"
                    " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
                    algo->name(), timer.get_secs(), param.actual_timeout);
 #else
            mgb_log_warn(
                    "profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
                    timer.get_secs(), param.actual_timeout);
 #endif
            next_report_time = timer.get_secs() + 1;
        }
        using namespace std::literals;
 #if !__DEPLOY_ON_XP_SP2__
        std::this_thread::sleep_for(1000us);
 #endif
    }
    // release all free blocks owned by child process,
    // in order to avoid main process running out of memory
    cn.try_coalesce_all_free_memory();
    mgb_assert(ev_start->finished());
    return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)});
    MIDOUT_E
 };
 template <typename Opr>
 Maybe<typename TimedProfiler<Opr>::Result> TimedProfiler<Opr>::profile(
        const Param& param, double& timeout) {
    mgb_assert(timeout >= 0);
    if (!timeout) {
        timeout = timeout_setting;
    } else if (timeout_setting) {
        timeout = std::min(timeout, timeout_setting);
    }
    param.actual_timeout = timeout ? timeout : std::numeric_limits<double>::infinity();
    auto res = sys::TimedFuncInvoker::ins().invoke(
            AlgoChooserFuncId<Opr>::ID, TParam::from_pod(const_cast<Param&>(param)),
            timeout);
    if (res.valid())
        return res.val().template as_single_pod<Result>();
    return None;
 }
 template <typename Opr>
 void TimedProfiler<Opr>::prof_init_device(const TParam& raw_param) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device")))
 #if MGB_ROCM
    megcore::enableMIOpenAlgoSearch(true);
 #endif
    auto&& param = raw_param.as_single_pod<Param>();
    CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical);
    // wait for cuda init, so its time does not get accounted in timeout
    cn.sync();
    MIDOUT_E
 }
 #define INST(Opr)                                                             \
    template const double TimedProfiler<megdnn::Opr>::timeout_setting;        \
    template double TimedProfiler<megdnn::Opr>::init_timeout_setting();       \
    template typename TimedProfiler<megdnn::Opr>::TResult                     \
    TimedProfiler<megdnn::Opr>::prof_impl(const TParam& raw_param);           \
    template Maybe<typename TimedProfiler<megdnn::Opr>::Result>               \
    TimedProfiler<megdnn::Opr>::profile(const Param& param, double& timeout); \
    template void TimedProfiler<megdnn::Opr>::prof_init_device(const TParam& raw_param);
 DNN_FOREACH_FASTRUN_OPR(INST)
 #undef INST
 }  // namespace rdnn
 }  // namespace mgb
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/rdnn/include/megbrain/rdnn/algo_chooser.h
+++ b/src/rdnn/include/megbrain/rdnn/algo_chooser.h
@@ -0,0 +1,174 @@
 #pragma once
 #include <memory>
 #include "megbrain/opr/param_defs.h"
 #include "megbrain/rdnn/profiler.h"
 #include "megbrain/utils/persistent_cache.h"
 #include "megdnn/oprs/base.h"
 namespace mgb {
 namespace rdnn {
 //! define logical operation of megdnn::param::ExecutionPolicy::Strategy::Enum
 //! and megdnn::detail::AlgoAttribute enum
 using ExecutionStrategy = megdnn::param::ExecutionPolicy::Strategy;
 using AlgoAttribute = megdnn::AlgoAttribute;
 /* =================== AlgoChooser =================== */
 /*!
 * \brief choose algorithm according to ExecutionPolicy
 *
 * This class only provides static methods, and the entry point is
 * AlgoChooser::setup_algo. When profiling is needed, it would first try to
 * retrive profiling stats from cache, and run TimedProfiler when necessary
 *
 * \tparam Opr megdnn operator impl
 */
 struct AlgoChooserDesc {
    uint32_t shared_batch_size = 0;
    bool binary_equal_between_batch = false;
    bool no_profiling_on_shape_change = false;
    using WorkspaceLimitGetter = std::function<size_t(CompNode, size_t)>;
    WorkspaceLimitGetter get_workspace_limit;
 };
 template <typename Opr>
 class AlgoChooser {
    static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
    static constexpr int arity_out = OprArityTrait<Opr>::arity_out;
    static constexpr int arity = OprArityTrait<Opr>::arity;
    using ImplAlgo = typename Opr::AlgorithmInfo;
    using ImplAlgoDesc = typename Opr::AlgorithmInfo::Desc;
 protected:
    using ImplExecutionPolicy = megdnn::ExecutionPolicy;
 public:
    using FixedTensorLayouts = std::array<TensorLayout, arity>;
    class AlgoChooserHelper {
        //! fastrun layouts
        FixedTensorLayouts m_fastrun_layouts;
        //! layouts used when get and set cache item
        FixedTensorLayouts m_incache_layouts;
        Opr* m_dnn_opr;
        std::string m_param;
        CompNode m_cn;
        megdnn::param::ExecutionPolicy m_execution_policy;
        bool m_allow_weight_preprocess;
        const AlgoChooserDesc& m_desc;
    public:
        MGE_WIN_DECLSPEC_FUC AlgoChooserHelper(
                const FixedTensorLayouts& layouts, Opr* megdnn_opr,
                const std::string& param_str, const CompNode& cn,
                const megdnn::param::ExecutionPolicy& execution_policy,
                bool allow_weight_preprocess, const AlgoChooserDesc& desc);
        Opr* megdnn_opr() const { return m_dnn_opr; }
        const TensorLayout& inp_layout(size_t idx) const {
            return m_fastrun_layouts[idx];
        }
        const megdnn::param::ExecutionPolicy& execution_policy() const {
            return m_execution_policy;
        }
        CompNode comp_node() const { return m_cn; }
        const std::string& param() const { return m_param; }
        bool allow_weight_preprocess() const { return m_allow_weight_preprocess; }
        megdnn::Algorithm* get_algorithm_from_desc(
                const megdnn::Algorithm::Info::Desc& desc) const {
            return m_dnn_opr->get_algorithm_from_desc(desc);
        }
        const FixedTensorLayouts& fastrun_layouts() const { return m_fastrun_layouts; }
        const FixedTensorLayouts& incache_layouts() const { return m_incache_layouts; }
        const AlgoChooserDesc& desc() const { return m_desc; }
        //! construct algo chain by heuristic
        ImplExecutionPolicy choose_by_heuristic(
                const ExecutionStrategy& selected_strategy) const;
        //! construct algo chain by profiling
        ImplExecutionPolicy choose_by_profile(
                const ExecutionStrategy& selected_strategy, bool enable_update) const;
        //! get all profile algorithm from cache, return invalid if not exists
        std::pair<ImplAlgoDesc, Maybe<AlgoChooserProfileCache::Result>>
        get_profile_result_from_cache(const ExecutionStrategy& selected_strategy) const;
        /**
         * \brief construct execution policy from cache or heuristic.
         *
         * \param selected_strategy select algo which matched this strategy
         * \param[in,out] policy execution policy
         * \param retrive_from_cache retrive algo from cache if set True, get
         *     from heuristic otherwise.
         * \param allow_log no warning log print if set True, print warning info
         * otherwise.
         */
        void construct_execution_policy(
                const ExecutionStrategy& selected_strategy, ImplExecutionPolicy& policy,
                bool retrive_from_cache = true, bool allow_log = true) const;
        //! get workspace size required for specific execution policy
        MGE_WIN_DECLSPEC_FUC size_t get_workspace_size_bytes(
                const ImplExecutionPolicy& policy,
                const FixedTensorLayouts& layouts = {}) const;
        //! get all candidate algos, and the one choose_by_heuristic() is
        //! put first
        std::vector<ImplAlgo> get_all_candidates() const;
        /*!
         * \brief profile a single algorithm
         *
         * This is actually a wrapper that constructs param and call
         * TimedProfiler<Opr>::profile for the actual profiling
         *
         * \param[in,out] timeout set the timeout, and return the actual
         *      timeout used during profiling
         */
        Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo(
                const ImplExecutionPolicy& policy, double& timeout) const;
        //! profile and save to cache
        void profile(const ExecutionStrategy& selected_strategy) const;
        /**
         * \brief extract algo attribute from execution strategy and graph
         * option.
         *
         * \param strategy select algo which matched this strategy
         * \return pair<positive_attr, negative_attr>
         */
        std::pair<AlgoAttribute, AlgoAttribute> extract_algo_attribute(
                const ExecutionStrategy& strategy) const;
    private:
        Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter(
                const FixedTensorLayouts& layouts = {}) const;
    };
    template <typename U>
    friend class AlgoChooser;
    //! entrance for getting algorithm according to execution strategy
    MGE_WIN_DECLSPEC_FUC static ImplExecutionPolicy get_policy(
            const AlgoChooserHelper& helper);
    //! format given layouts to string
    static std::string format_fixlayouts(const FixedTensorLayouts& layout);
 };
 }  // namespace rdnn
 }  // namespace mgb
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/rdnn/include/megbrain/rdnn/management.h
+++ b/src/rdnn/include/megbrain/rdnn/management.h
@@ -0,0 +1,57 @@
 #pragma once
 #include "megbrain/comp_node.h"
 #include "megdnn/handle.h"
 namespace mgb {
 namespace opr {
 namespace intl {
 //! get megdnn handle from comp node
 MGE_WIN_DECLSPEC_FUC megdnn::Handle* get_megdnn_handle(CompNode comp_node);
 MGE_WIN_DECLSPEC_FUC std::shared_ptr<megdnn::Handle> get_megdnn_handle_shared(
        CompNode comp_node);
 /*!
 * \brief get global megdnn operator asscoated with a computing node
 * \tparam Opr megdnn operator class, must be one of:
 *      * AddUpdate
 *      * Relayout
 *      * Checksum
 */
 template <typename Opr>
 MGE_WIN_DECLSPEC_FUC Opr* get_megdnn_global_opr(CompNode comp_node);
 template <class Obj>
 class UniqPtrWithCN : public std::unique_ptr<Obj> {
    CompNode m_cn;
 public:
    UniqPtrWithCN() = default;
    template <class RObj>
    UniqPtrWithCN(UniqPtrWithCN<RObj>&& o)
            : std::unique_ptr<Obj>(std::move(o)), m_cn(o.comp_node()) {}
    UniqPtrWithCN(std::unique_ptr<Obj> ptr, CompNode cn)
            : std::unique_ptr<Obj>{std::move(ptr)}, m_cn{cn} {}
    CompNode comp_node() const { return m_cn; }
 };
 //! create megdnn opr from megdnn handle in a CompNode
 template <class Opr>
 UniqPtrWithCN<Opr> create_megdnn_opr(CompNode comp_node) {
    return {get_megdnn_handle(comp_node)->create_operator<Opr>(), comp_node};
 }
 }  // namespace intl
 }  // namespace opr
 namespace rdnn {
 template <typename Obj>
 using UniqPtrWithCN = opr::intl::UniqPtrWithCN<Obj>;
 }  // namespace rdnn
 }  // namespace mgb
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/rdnn/include/megbrain/rdnn/profiler.h
+++ b/src/rdnn/include/megbrain/rdnn/profiler.h
@@ -0,0 +1,152 @@
 #pragma once
 #include "megbrain/comp_node.h"
 #include "megbrain/rdnn/management.h"
 #include "megbrain/system.h"
 #include "megbrain/tensor.h"
 #include "megbrain/utils/hash_ct.h"
 #include "megbrain/utils/timer.h"
 #include "megdnn/basic_types.h"
 #include "megdnn/oprs.h"
 namespace mgb {
 namespace rdnn {
 // clang-format off
 #define DNN_FOREACH_FASTRUN_OPR(cb)  \
    cb(ConvolutionForward)           \
    cb(ConvBiasForward)              \
    cb(ConvolutionBackwardData)      \
    cb(ConvolutionBackwardFilter)    \
    cb(Convolution3DForward)         \
    cb(Convolution3DBackwardData)    \
    cb(Convolution3DBackwardFilter)  \
    cb(LocalShareForward)            \
    cb(LocalShareBackwardData)       \
    cb(LocalShareBackwardFilter)     \
    cb(DeformableConvForward)        \
    cb(DeformableConvBackwardFilter) \
    cb(DeformableConvBackwardData)   \
    cb(BatchConvBiasForward)         \
    cb(MatrixMul)                    \
    cb(BatchedMatrixMul)             \
    cb(PoolingForward)               \
    cb(PoolingBackward)
 // clang-format on
 template <typename Opr>
 constexpr bool opr_supports_preprocess() {
    return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
           std::is_same<Opr, megdnn::ConvBias>::value;
 }
 template <typename Opr>
 constexpr bool opr_contain_bias() {
    return std::is_same<Opr, megdnn::ConvBias>::value;
 }
 //! matmul and batchedMatrixMul
 template <typename Opr>
 constexpr bool is_matmul() {
    return std::is_same<Opr, megdnn::MatrixMul>::value ||
           std::is_same<Opr, megdnn::BatchedMatrixMul>::value;
 }
 template <typename Opr, bool has_prep>
 struct PreprocessFilterImpl {
    using T = union {};
 };
 template <typename Opr>
 struct PreprocessFilterImpl<Opr, true> {
    using T = typename Opr::PreprocessedFilter;
 };
 template <typename Opr>
 using PreprocessFilter =
        typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T;
 template <typename Opr>
 struct AlgoChooserFuncId {};
 #define DEF_FUNC_ID(func)                                                           \
    template <>                                                                     \
    struct AlgoChooserFuncId<megdnn::func> {                                        \
        __attribute__((unused)) static constexpr sys::TimedFuncInvoker::FuncId ID = \
                static_cast<sys::TimedFuncInvoker::FuncId>(                         \
                        MGB_HASH_STR("megdnn::" #func));                            \
    };
 DNN_FOREACH_FASTRUN_OPR(DEF_FUNC_ID)
 #undef DEF_FUNC_ID
 /* =================== TimedProfiler =================== */
 /*!
 * \brief profile a megdnn opr conv with given param
 *
 * This class only provides static methods, and the entry point is
 * TimedProfiler::profile; it would run profiler in a timed environment by
 * sys::TimedFuncInvoker
 *
 * \tparam Opr megdnn opr impl
 */
 template <typename Opr>
 class TimedProfiler {
    static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
    static constexpr int arity_out = OprArityTrait<Opr>::arity_out;
    static constexpr int arity = OprArityTrait<Opr>::arity;
    using TensorShapeArray = std::array<megdnn::TensorShape, arity>;
 public:
    struct Param {
        struct ExecutionPolicyBlob {
            //! enlarge the max size if needed
            constexpr static size_t MAX_SIZE_IN_BYTES = 10240;
            char data[MAX_SIZE_IN_BYTES];
            uint32_t size;
            static ExecutionPolicyBlob serialize(const megdnn::ExecutionPolicy& policy);
            megdnn::ExecutionPolicy deserialize() const;
        };
        ExecutionPolicyBlob execution_policy;
        size_t workspace;
        megdnn::DTypeEnum dtypes[arity];
        CompNode::Locator comp_node_physical, comp_node_logical;
        TensorShapeArray shapes;
        typename Opr::Param opr_param;
        bool allow_weight_preprocess;
        //! filled by profile()
        mutable double actual_timeout;
    };
    struct Result {
        double time;
    };
    static Maybe<Result> profile(const Param& param, double& timeout);
 private:
    using TParam = sys::TimedFuncInvoker::Param;
    using TResult = sys::TimedFuncInvoker::Result;
    static const double timeout_setting;
    static double init_timeout_setting();
    static void preprocess(
            const megdnn::TensorLayoutArray& preprocessed_layout,
            const SmallVector<DeviceTensorND>& flt_val, UniqPtrWithCN<Opr>& megdnn_opr,
            megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
            std::array<DeviceTensorND, arity_in>& inp_val,
            PreprocessFilter<Opr>& prep_flt);
    static TResult prof_impl(const TParam& raw_param);
    static void prof_init_device(const TParam& raw_param);
 };
 }  // namespace rdnn
 }  // namespace mgb
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}