From d56570d929d496d0f49a0c256de34bb60f423e2c Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Fri, 18 Mar 2022 18:03:18 +0800 Subject: [PATCH] fix(megbrain): add rdnn to copybara GitOrigin-RevId: 7d8bf770532d385dc8a797192cddf5587b27afe0 --- imperative/src/impl/proxy_graph/mini_graph.h | 19 +- src/rdnn/impl/algo_chooser.cpp | 1148 +++++++++++++++++++++++++ src/rdnn/impl/management.cpp | 66 ++ src/rdnn/impl/profiler.cpp | 400 +++++++++ src/rdnn/include/megbrain/rdnn/algo_chooser.h | 174 ++++ src/rdnn/include/megbrain/rdnn/management.h | 57 ++ src/rdnn/include/megbrain/rdnn/profiler.h | 152 ++++ 7 files changed, 2001 insertions(+), 15 deletions(-) create mode 100644 src/rdnn/impl/algo_chooser.cpp create mode 100644 src/rdnn/impl/management.cpp create mode 100644 src/rdnn/impl/profiler.cpp create mode 100644 src/rdnn/include/megbrain/rdnn/algo_chooser.h create mode 100644 src/rdnn/include/megbrain/rdnn/management.h create mode 100644 src/rdnn/include/megbrain/rdnn/profiler.h diff --git a/imperative/src/impl/proxy_graph/mini_graph.h b/imperative/src/impl/proxy_graph/mini_graph.h index 67cc292a..be393406 100644 --- a/imperative/src/impl/proxy_graph/mini_graph.h +++ b/imperative/src/impl/proxy_graph/mini_graph.h @@ -320,15 +320,12 @@ public: input->force_assign_dev_tensor_from_tensor(dev_tensor); - mgb_assert(input->comp_node() == dev_tensor.comp_node()); mgb_assert(input->shape().eq_shape(layout)); - mgb_assert(input->dtype() == layout.dtype); idx++; } } void init_output_tensor(const SmallVector& outputs) { - mgb_assert(m_opr->usable_output().size() == outputs.size()); ::mgb::opr::intl::WorkspaceLimitHook::set_impl( m_opr->owner_graph(), get_workspace_limit); @@ -347,9 +344,6 @@ public: mgb_assert(j < outputs.size()); auto&& tensor = outputs[j]; auto&& layout = tensor->layout(); - mgb_assert(var->comp_node() == tensor->comp_node()); - mgb_assert(var->shape().eq_shape(layout)); - mgb_assert(var->dtype() == layout.dtype); if (var->m_mem_plan.chunk().owner_var != var) { tensor->assign_from_dev_tensor( var->m_dev_tensor); // memory forwarding @@ -816,7 +810,6 @@ public: // minigraph.opr()->usable_output() bug execution may use the attrs for those // output var, so we infer attrs for all outputs, but only return // LogicalTensorDesc for minigraph.opr()->usable_output() - SmallVector output_descs; for (size_t i = 0; i < minigraph.opr()->output().size(); ++i) { auto* var = minigraph.opr()->output()[i]; auto* shape = sess.infer(sess.output_data[i].shape_infer, true); @@ -824,19 +817,15 @@ public: var->shape(*shape); } - for (size_t i = 0; i < minigraph.output_size(); ++i) { + SmallVector outputs(minigraph.output_size(), {}); + for (size_t i = 0; i < outputs.size(); i++) { auto* ovar = minigraph.output_var(i); mgb_assert(ovar->dtype().valid() && ovar->comp_node().valid()); mgb_assert( ovar->shape().ndim || ovar->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC)); - output_descs.push_back({{ovar->shape(), ovar->dtype()}, ovar->comp_node()}); - } - - SmallVector outputs(output_descs.size(), {}); - for (size_t i = 0; i < outputs.size(); i++) { - outputs[i] = - Tensor::make(output_descs[i].layout, output_descs[i].comp_node); + outputs[i] = Tensor::make( + TensorLayout{ovar->shape(), ovar->dtype()}, ovar->comp_node()); } auto raw_outputs = to_raw_ptr_array(outputs, false); diff --git a/src/rdnn/impl/algo_chooser.cpp b/src/rdnn/impl/algo_chooser.cpp new file mode 100644 index 00000000..00bcf8b7 --- /dev/null +++ b/src/rdnn/impl/algo_chooser.cpp @@ -0,0 +1,1148 @@ +#include +#include + +#include 
"megbrain/exception.h" +#include "megbrain/rdnn/algo_chooser.h" +#include "megbrain/utils/invoke.h" + +//! TODO: here has to be know some megdnn::opr when there is produced midout.h +//! fix it if there is another graceful way. +#include "megdnn/opr_param_defs.h" +#include "megdnn/oprs.h" +#include "megdnn/oprs/base.h" +#include "midout.h" +MIDOUT_DECL(megbrain_opr_algo_chooser) +#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) { +#define MIDOUT_E \ + } \ + MIDOUT_END(); + +using namespace megdnn; +using namespace mgb; + +#define APPLY(statement, ...) \ + mgb::apply( \ + [&](const auto&... args) { return statement; }, \ + std::tuple_cat(__VA_ARGS__)) + +// timeout delta to be added with fastest known algorithm for new algos +constexpr double TIMEOUT_TOLERANCE = 2; + +#define CACHE_KEY_VERSION "v5" + +namespace { + +template +struct MegDNNOpr2Typename; + +#define cb(_Opr) \ + template <> \ + struct MegDNNOpr2Typename { \ + static const char* name; \ + }; \ + const char* MegDNNOpr2Typename::name = #_Opr; + +DNN_FOREACH_FASTRUN_OPR(cb) +#undef cb + +template +std::string profile_name(Opr* opr) { + std::string ret = std::string(::MegDNNOpr2Typename::name) + CACHE_KEY_VERSION; + ret.append(opr->get_algorithm_set_name()); + return ret; +} + +template +std::string format_fixlayouts( + const typename rdnn::AlgoChooser::FixedTensorLayouts& layouts, + size_t arity_in, size_t arity_out, const std::string& delimiter = " -> ") { + std::string ret; + if (arity_in) { + ret.append("("); + for (size_t i = 0; i < arity_in; ++i) { + if (i) { + ret.append(", "); + } + ret.append(layouts[i].to_string() + " "); + } + ret.append(")"); + } + if (arity_in && arity_out) { + ret.append(delimiter); + } + if (arity_out) { + ret.append("("); + for (size_t i = 0; i < arity_out; ++i) { + if (i) { + ret.append(", "); + } + ret.append(layouts[i + arity_in].to_string() + " "); + } + ret.append(")"); + } + return ret; +} + +/** + * \brief Check if the sub opr list has circular dependence. 
+ */ +class CircularDepsChecker { + struct SearchItemStorage { + std::string data_hold; + size_t hash = 0; + + SearchItemStorage(const Algorithm::SearchItem& item) { + Algorithm::serialize_write_pod(item.opr_type, data_hold); + for (auto&& layout : item.layouts) { + data_hold += layout.serialize(); + } + data_hold += item.param; + } + + SearchItemStorage& init_hash() { + hash = XXHash64CT::hash(data_hold.data(), data_hold.size(), 20201225); + return *this; + } + + bool operator==(const SearchItemStorage& rhs) const { + return data_hold == rhs.data_hold; + } + + struct Hash { + size_t operator()(const SearchItemStorage& s) const { return s.hash; } + }; + }; + std::unordered_set m_set; + +public: + void put(const megdnn::Algorithm::SearchItem& key) { + SearchItemStorage key_storage(key); + key_storage.init_hash(); + mgb_assert( + m_set.find(key_storage) == m_set.end(), + "Circular dependency during flatten search space"); + auto ret = m_set.insert(std::move(key_storage)); + mgb_assert(ret.second); + } + void remove(const megdnn::Algorithm::SearchItem& key) { + SearchItemStorage key_storage(key); + key_storage.init_hash(); + auto&& iter = m_set.find(key_storage); + mgb_assert(iter != m_set.end()); + m_set.erase(iter); + } +}; + +///////////////// OprTypeTrait ///////////////////////////// +template +struct OprFromOprTypeTrait; + +template +struct OprTypeFromOprTrait; + +#define cb(_opr_type, _opr) \ + template <> \ + struct OprFromOprTypeTrait { \ + using Opr = megdnn::_opr; \ + }; \ + template <> \ + struct OprTypeFromOprTrait { \ + constexpr static megdnn::Algorithm::OprType opr_type = \ + megdnn::Algorithm::OprType::_opr_type; \ + } + +cb(MATRIX_MUL_FORWARD, MatrixMulForward); +cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward); +cb(CONVOLUTION_FORWARD, ConvolutionForward); +cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData); +cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter); +cb(CONVOLUTION3D_FORWARD, Convolution3DForward); +cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData); +cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter); +cb(LOCAL_SHARE_FORWARD, LocalShareForward); +cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData); +cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter); +cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward); +cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); +cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); +cb(BATCH_CONV_FORWARD, BatchConvBiasForward); +cb(CONVBIAS_FORWARD, ConvBiasForward); +cb(POOLING_FORWARD, PoolingForward); +cb(POOLING_BACKWARD, PoolingBackward); + +#undef cb + +// clang-format off +#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \ + cb(MATRIX_MUL_FORWARD, stmt) \ + cb(BATCHED_MATRIX_MUL_FORWARD, stmt) \ + cb(CONVOLUTION_FORWARD, stmt) \ + cb(CONVOLUTION_BACKWARD_DATA, stmt) \ + cb(CONVOLUTION_BACKWARD_FILTER, stmt) \ + cb(CONVOLUTION3D_FORWARD, stmt) \ + cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \ + cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \ + cb(LOCAL_SHARE_FORWARD, stmt) \ + cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \ + cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \ + cb(DEFORMABLE_CONV_FORWARD, stmt) \ + cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ + cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ + cb(BATCH_CONV_FORWARD, stmt) \ + cb(CONVBIAS_FORWARD, stmt) \ + cb(POOLING_FORWARD, stmt) \ + cb(POOLING_BACKWARD, stmt) +// clang-format on + +#define _OPR_TYPE_CASE(_opr_type, _stmt) \ + case Algorithm::OprType::_opr_type: { \ + using _Opr = typename OprFromOprTypeTrait::Opr; \ + _stmt; \ + 
break; \ + } + +#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \ + for (size_t _item_idx = 0; _item_idx < _search_items.size(); _item_idx++) { \ + auto&& _item = _search_items[_item_idx]; \ + switch (_item.opr_type) { \ + FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \ + default: \ + mgb_throw(MegBrainError, "unknown opr_type"); \ + } \ + } + +template +TensorLayoutArray to_layout_array( + const typename rdnn::AlgoChooser::FixedTensorLayouts& layouts) { + TensorLayoutArray ret; + for (auto&& layout : layouts) { + ret.push_back(layout); + } + return ret; +} + +template +typename rdnn::AlgoChooser::FixedTensorLayouts to_fixed_layouts( + const TensorLayoutArray& layouts) { + typename rdnn::AlgoChooser::FixedTensorLayouts ret; + mgb_assert(ret.size() == layouts.size()); + size_t idx = 0; + for (auto&& layout : layouts) { + ret[idx++] = layout; + } + return ret; +} + +/** + * flatten search space in postorder traversal + * The subopr search construct a search tree + * + * A + * / \ + * B1B2 C + * / \ + * D1D2D3 E + * We use postorder traverse the search tree. + * D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A + */ +template +std::vector flatten_search_space( + const typename rdnn::AlgoChooser::AlgoChooserHelper& helper, + CircularDepsChecker& checker) { + auto&& search_item = megdnn::Algorithm::SearchItem{ + OprTypeFromOprTrait::opr_type, helper.param(), + to_layout_array(helper.fastrun_layouts())}; + checker.put(search_item); + std::vector ret; + for (auto algo_info : helper.get_all_candidates()) { + megdnn::Algorithm* algo = helper.get_algorithm_from_desc(algo_info.desc); + mgb_assert(algo, "Unknown algo description"); + std::vector&& sub_items = algo->get_subopr_list( + to_layout_array(helper.fastrun_layouts()), helper.megdnn_opr()); + + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(helper.comp_node()); + megdnn_opr->param() = + Algorithm::deserialize_read_pod(_item.param); + typename rdnn::AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( + to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), + _item.param, helper.comp_node(), helper.execution_policy(), + helper.allow_weight_preprocess(), helper.desc()); + auto space = flatten_search_space<_Opr>(sub_helper, checker); + ret.insert(ret.end(), space.begin(), space.end()); + }); + } + ret.push_back(search_item); + checker.remove(search_item); + return ret; +} + +//! serialize a algo's desc to string. format is +//! 
handle_type|algo_type|size_of_param|size_of_name|string_of_param|string_of_name +static void serialize_write_pod(const Algorithm::Info::Desc& val, std::string& result) { + megdnn::Algorithm::serialize_write_pod(val.handle_type, result); + megdnn::Algorithm::serialize_write_pod(val.type, result); + uint32_t param_size = val.param.size(); + uint32_t name_size = val.name.size(); + megdnn::Algorithm::serialize_write_pod(param_size, result); + megdnn::Algorithm::serialize_write_pod(name_size, result); + megdnn::Algorithm::serialize_write_pod(val.param, result); + megdnn::Algorithm::serialize_write_pod(val.name, result); +} + +static Algorithm::Info::Desc deserialize_read_pod( + const std::string& data, size_t offset = 0) { + Algorithm::Info::Desc ret; +#define cb(_val, _type) \ + _val = megdnn::Algorithm::deserialize_read_pod<_type>(data.data(), offset); \ + offset += sizeof(_val) + + cb(ret.handle_type, megdnn::Handle::HandleType); + cb(ret.type, uint32_t); + + uint32_t param_size = 0; + uint32_t name_size = 0; + cb(param_size, uint32_t); + cb(name_size, uint32_t); + + if (param_size > 0) { + ret.param = std::string(data.data() + offset, param_size); + offset += param_size; + } + if (name_size > 0) { + ret.name = std::string(data.data() + offset, name_size); + offset += name_size; + } + return ret; +} + +} // namespace + +namespace megdnn { +namespace param { +MGB_DEF_ENUM_CLASS_BIT_OPR(ExecutionPolicy::Strategy) +} // namespace param +} // namespace megdnn + +namespace mgb { +namespace rdnn { +template +class LayoutsModifier { + using FixedTensorLayouts = typename AlgoChooser::FixedTensorLayouts; + +public: + static void on(FixedTensorLayouts&, const typename Opr::Param&, size_t) {} + +private: + //! index of batch in tensor, 3 for CHWN4 e.g. + static size_t index_of_batch(const typename Opr::Param&) { return 0; } + + //! indices contain batch in inputs and outputs, src(0) dst(2) for conv e.g. 
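+ //! The primary template's on() is a no-op; the specializations below
+ //! rewrite the batch dimension of each listed layout to the requested
+ //! batch size. This is what applies desc.shared_batch_size to the fastrun
+ //! layouts (and batch 0 to the in-cache layouts, making cache keys
+ //! batch-agnostic).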
+ static std::vector sm_indices_contain_batch; +}; +template +std::vector LayoutsModifier::sm_indices_contain_batch = {}; + +#define DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \ + template <> \ + class LayoutsModifier { \ + public: \ + using FixedTensorLayouts = typename AlgoChooser::FixedTensorLayouts; \ + static void on( \ + FixedTensorLayouts& layouts, const opr::Param& param, \ + size_t new_batch_size) { \ + size_t batch_index = index_of_batch(param); \ + for (size_t index : sm_indices_contain_batch) { \ + layouts.at(index)[batch_index] = new_batch_size; \ + } \ + } \ + \ + private: \ + static size_t index_of_batch(const opr::Param&) { return 0; } \ + static std::vector sm_indices_contain_batch; \ + }; \ + std::vector LayoutsModifier::sm_indices_contain_batch = idxs; + +DEFAULT_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::Convolution3DForward, (std::initializer_list{0, 2})) +DEFAULT_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::Convolution3DBackwardData, (std::initializer_list{1, 2})) +DEFAULT_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::Convolution3DBackwardFilter, (std::initializer_list{0, 1})) +DEFAULT_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::BatchedMatrixMul, (std::initializer_list{0, 1, 2})) +#undef DEFAULT_OPR_WITHOUT_INPUT_BROADCAST + +#define CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \ + template <> \ + class LayoutsModifier { \ + public: \ + using FixedTensorLayouts = typename AlgoChooser::FixedTensorLayouts; \ + static void on( \ + FixedTensorLayouts& layouts, const opr::Param& param, \ + size_t new_batch_size) { \ + size_t batch_index = index_of_batch(param); \ + for (size_t index : sm_indices_contain_batch) { \ + layouts.at(index)[batch_index] = new_batch_size; \ + } \ + } \ + \ + private: \ + static size_t index_of_batch(const opr::Param& param) { \ + if (param.format == opr::Param::Format::CHWN4) { \ + return 3; \ + } \ + return 0; \ + } \ + static std::vector sm_indices_contain_batch; \ + }; \ + std::vector LayoutsModifier::sm_indices_contain_batch = idxs; + +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::ConvolutionForward, (std::initializer_list{0, 2})) +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::ConvolutionBackwardData, (std::initializer_list{1, 2})) +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::ConvolutionBackwardFilter, (std::initializer_list{0, 1})) +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::LocalShareForward, (std::initializer_list{0, 2})) +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::LocalShareBackwardData, (std::initializer_list{1, 2})) +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::LocalShareBackwardFilter, (std::initializer_list{0, 1})) +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::DeformableConvForward, (std::initializer_list{0, 2, 3, 4})) +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::DeformableConvBackwardData, + (std::initializer_list{0, 2, 3, 4, 5, 6, 7})) +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::DeformableConvBackwardFilter, + (std::initializer_list{0, 1, 2, 3})) +CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( + megdnn::BatchConvBiasForward, (std::initializer_list{0, 1, 2, 3, 4})) +#undef CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST + +template <> +class LayoutsModifier { +public: + using FixedTensorLayouts = + typename AlgoChooser::FixedTensorLayouts; + static void on( + FixedTensorLayouts& layouts, const megdnn::ConvBiasForward::Param& param, + size_t new_batch_size) { + size_t batch_index = index_of_batch(param); + for (size_t index : sm_indices_contain_batch) { + layouts.at(index)[batch_index] = new_batch_size; + } + for (size_t 
index : sm_indices_contain_batch_broadcast) { + if (!check_bias_share_in_channel(layouts.at(index), param.format)) { + layouts.at(index)[batch_index] = new_batch_size; + } + } + } + +private: + static std::vector sm_indices_contain_batch; + static std::vector sm_indices_contain_batch_broadcast; + static size_t index_of_batch(const megdnn::ConvBiasForward::Param& param) { + if (param.format == megdnn::ConvBiasForward::Param::Format::CHWN4) { + return 3; + } + return 0; + } +}; +std::vector LayoutsModifier::sm_indices_contain_batch = + {0, 3, 4}; +std::vector + LayoutsModifier::sm_indices_contain_batch_broadcast = { + 2}; + +template <> +class LayoutsModifier { +public: + using FixedTensorLayouts = + typename AlgoChooser::FixedTensorLayouts; + static void on( + FixedTensorLayouts& layouts, const megdnn::MatrixMul::Param& param, + size_t new_batch_size) { + //! Because we do not know whether the batch size is in the dimension m + //! or the dimension n, we just ignore both m and n here. + // FIXME Find a way to make mgb obtain batch size information from R or + // automatically + layouts.at(2)[0] = new_batch_size; + layouts.at(2)[1] = new_batch_size; + if (param.transposeA) { + layouts.at(0)[1] = new_batch_size; + } else { + layouts.at(0)[0] = new_batch_size; + } + if (param.transposeB) { + layouts.at(1)[0] = new_batch_size; + } else { + layouts.at(1)[1] = new_batch_size; + } + } +}; + +///////////////////////////// AlgoChooserHelper ////////////////////////// +template +AlgoChooser::AlgoChooserHelper::AlgoChooserHelper( + const FixedTensorLayouts& layouts, Opr* megdnn_opr, + const std::string& param_str, const CompNode& cn, + const megdnn::param::ExecutionPolicy& execution_policy, + bool allow_weight_preprocess, const AlgoChooserDesc& desc) + : m_fastrun_layouts{layouts}, + m_incache_layouts{layouts}, + m_dnn_opr{megdnn_opr}, + m_param{param_str}, + m_cn{cn}, + m_execution_policy{execution_policy}, + m_allow_weight_preprocess{allow_weight_preprocess}, + m_desc{desc} { + auto fastrun_batch_size = desc.shared_batch_size; + + if (fastrun_batch_size) { + LayoutsModifier::on(m_incache_layouts, m_dnn_opr->param(), 0); + LayoutsModifier::on( + m_fastrun_layouts, m_dnn_opr->param(), fastrun_batch_size); + } + + if (m_desc.no_profiling_on_shape_change) { + for (size_t i = 0; i < m_incache_layouts.size(); i++) { + for (size_t j = 0; j < m_incache_layouts.at(i).ndim; j++) { + m_incache_layouts.at(i)[j] = 0; + m_incache_layouts.at(i).stride[j] = 0; + } + } + } + + mgb_assert(m_fastrun_layouts.size() == layouts.size()); + + static_assert( + std::tuple_size::value == 2 || + std::tuple_size::value == 3 || + std::tuple_size::value == 4 || + std::tuple_size::value == 5 || + std::tuple_size::value == 8, + "Pooling assumes arity = 2 or 4,Convolution AlgoChooser assumes " + "arity = 3 , 5 or 8 (for deformable conv)"); +} + +template +typename AlgoChooser::ImplExecutionPolicy AlgoChooser::AlgoChooserHelper:: + choose_by_heuristic(const ExecutionStrategy& selected_strategy) const { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("choose_by_heuristic"))) + ImplExecutionPolicy policy; + auto workspace_limit = + m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit); + auto attr = extract_algo_attribute(selected_strategy); + policy.algo = APPLY(m_dnn_opr->get_algorithm_info_heuristic( + args..., workspace_limit, attr.first, attr.second), + m_fastrun_layouts) + .desc; + + Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo); + mgb_assert(algo, "Unknown algo description"); + std::vector&& sub_items = 
+ algo->get_subopr_list(to_layout_array(m_fastrun_layouts), m_dnn_opr); + + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn); + megdnn_opr->param() = + Algorithm::deserialize_read_pod(_item.param); + typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( + to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), _item.param, + m_cn, m_execution_policy, m_allow_weight_preprocess, m_desc); + policy.sub_policy.push_back(sub_helper.choose_by_heuristic(selected_strategy)); + }); + + return policy; + MIDOUT_E +} + +template +typename AlgoChooser::ImplExecutionPolicy AlgoChooser::AlgoChooserHelper:: + choose_by_profile( + const ExecutionStrategy& selected_strategy, bool enable_update) const { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("choose_by_profile"))) + if (m_desc.no_profiling_on_shape_change) { + auto policy = m_dnn_opr->execution_policy(); + if (policy.algo.valid()) { + return policy; + } + + if (is_matmul()) { + mgb_log_warn( + "choose algo by heuristic, which may cause performance " + "regression."); + return choose_by_heuristic(selected_strategy); + } + } + + typename AlgoChooser::ImplExecutionPolicy tmp_policy; + bool retrive_from_cache = true; + bool allow_log = false; + construct_execution_policy( + selected_strategy, tmp_policy, retrive_from_cache, allow_log); + if (tmp_policy.algo.valid()) { + // return policy when contruct successed + return tmp_policy; + } + + if (enable_update) { + CircularDepsChecker circular_deps_checker; + auto&& search_items = flatten_search_space(*this, circular_deps_checker); + FOREACH_OPR_TYPE_DISPATCH(search_items, { + auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn); + megdnn_opr->param() = + Algorithm::deserialize_read_pod(_item.param); + typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( + to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), + _item.param, m_cn, m_execution_policy, m_allow_weight_preprocess, + m_desc); + sub_helper.profile(selected_strategy); + }); + } + + typename AlgoChooser::ImplExecutionPolicy policy; + construct_execution_policy(selected_strategy, policy); + return policy; + MIDOUT_E +} + +template +std::pair< + typename AlgoChooser::ImplAlgoDesc, Maybe> +AlgoChooser::AlgoChooserHelper::get_profile_result_from_cache( + const ExecutionStrategy& selected_strategy) const { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_profile_result_from_cache"))) + AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); + + typename Opr::Param origin_param = m_dnn_opr->param(); + AlgoChooserProfileCache::Key cache_key{ + m_incache_layouts.data(), m_incache_layouts.size(), &origin_param, + sizeof(origin_param)}; + auto&& rst = cache.get(cache_key); + if (!rst.valid()) + return {{}, rst}; + + auto&& prof = rst.val(); + if (prof.empty()) + return {{}, rst}; + + size_t workspace_limit = + m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit); + auto target_attr = extract_algo_attribute(selected_strategy); + bool skip_by_negative = false; + bool skip_by_workspace = false; + for (auto&& i : prof) { + auto attr_of_algo = static_cast(i.attribute); + bool contain_attr_all_positive = + (target_attr.first == (attr_of_algo & target_attr.first)); + bool contain_attr_any_negative = + static_cast(attr_of_algo & target_attr.second); + if (contain_attr_all_positive) { + if (!contain_attr_any_negative) { + if (i.workspace <= workspace_limit) { + Algorithm::Info::Desc algo_desc = deserialize_read_pod(i.algo); + return {algo_desc, rst}; + } + skip_by_workspace = 
true; + } else { + skip_by_negative = true; + } + } + } + + if (skip_by_workspace) + return {}; + + std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts); + if (skip_by_negative) { + mgb_log_error( + "opr: %s, layouts: %s, No usable algo. There are available " + "algos match " + "positive strategy(%s), but filtered by negative stategy(%s).", + ::MegDNNOpr2Typename::name, layouts_str.c_str(), + Algorithm::attribute_str(target_attr.first).c_str(), + Algorithm::attribute_str(target_attr.second).c_str()); + } else { + mgb_log_error( + "opr: %s, layouts: %s, No usable algo. algos read from cache " + "could not " + "satisfy positive strategy(%s)", + ::MegDNNOpr2Typename::name, layouts_str.c_str(), + Algorithm::attribute_str(target_attr.first).c_str()); + } + + mgb_trap(); + MIDOUT_E +} + +template +void AlgoChooser::AlgoChooserHelper::construct_execution_policy( + const ExecutionStrategy& selected_strategy, + typename AlgoChooser::ImplExecutionPolicy& policy, bool retrive_from_cache, + bool allow_log) const { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_execution_policy"))) + if (!policy.algo.valid()) { + if (retrive_from_cache) { + policy.algo = get_profile_result_from_cache(selected_strategy).first; + if (!policy.algo.valid()) { + if (allow_log) { + auto target_attr = extract_algo_attribute(selected_strategy); + std::string layouts_str = + AlgoChooser::format_fixlayouts(m_fastrun_layouts); + std::string msg = ssprintf( + "(opr : %s, layouts %s, with attribute(%s) and " + "without attribute(%s)", + ::MegDNNOpr2Typename::name, layouts_str.c_str(), + Algorithm::attribute_str(target_attr.first).c_str(), + Algorithm::attribute_str(target_attr.second).c_str()); + mgb_log_warn( + "No algo get from cache for %s. This may caused by " + "mismatch with model and cache file or imcomplete " + "cache file. ex. profiling with version1, but " + "inferencing on version2 or profiling modelA but " + "inferencing modelB", + msg.c_str()); + } + return; + } + } else { + auto workspace_limit = m_desc.get_workspace_limit( + m_cn, m_execution_policy.workspace_limit); + + auto attr = extract_algo_attribute(selected_strategy); + policy.algo = + APPLY(m_dnn_opr->get_algorithm_info_heuristic( + args..., workspace_limit, attr.first, attr.second), + m_fastrun_layouts) + .desc; + mgb_assert( + policy.algo.valid(), + "No algo found from heuristic with strategy %u and " + "workspace limit %zu", + static_cast(selected_strategy), workspace_limit); + } + } + + Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo); + mgb_assert(algo, "Unknown algo description"); + std::vector&& sub_items = + algo->get_subopr_list(to_layout_array(m_fastrun_layouts), m_dnn_opr); + + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn); + megdnn_opr->param() = + Algorithm::deserialize_read_pod(_item.param); + typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( + to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), _item.param, + m_cn, m_execution_policy, m_allow_weight_preprocess, m_desc); + policy.sub_policy.push_back({}); + sub_helper.construct_execution_policy( + selected_strategy, policy.sub_policy.back(), retrive_from_cache, + allow_log); + if (!policy.sub_policy.back().algo.valid()) { + // means sub_helper.construct_execution_policy fails. 
clean up + // policy.algo and return + policy = {}; + return; + } + }); + MIDOUT_E +} + +template +size_t AlgoChooser::AlgoChooserHelper::get_workspace_size_bytes( + const ImplExecutionPolicy& policy, const FixedTensorLayouts& layouts) const { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_workspace_size_bytes"))) + m_dnn_opr->execution_policy() = policy; + size_t result; + const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts; + if (layouts.at(0).ndim) { + layouts_ptr = &layouts; + } + if_constexpr()>( + [&](auto _) { + auto&& opr = _(m_dnn_opr); + auto prep = this->construct_fake_preprocess_filter(*layouts_ptr); + PreprocessFilter* prep_ptr = prep.valid() ? &prep.val() : nullptr; + result = std::max( + APPLY(opr->get_preprocess_workspace_in_bytes(args...), + *layouts_ptr), + APPLY(opr->get_workspace_in_bytes(args..., prep_ptr), + *layouts_ptr)); + }, + /* else */ + [&](auto _) { + result = APPLY( + _(m_dnn_opr)->get_workspace_in_bytes(args...), *layouts_ptr); + }); + return result; + MIDOUT_E +} + +template +std::vector::ImplAlgo> AlgoChooser< + Opr>::AlgoChooserHelper::get_all_candidates() const { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_all_candidates"))) + auto heu = choose_by_heuristic(m_execution_policy.strategy); + auto&& ret = APPLY(m_dnn_opr->get_all_algorithms_info(args...), m_fastrun_layouts); + bool found = false; + for (size_t i = 0; i < ret.size(); ++i) { + if (ret[i].desc == heu.algo) { + found = true; + std::swap(ret[i], ret[0]); + break; + } + } + + Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(heu.algo); + mgb_assert(palgo, "Unknown algo description"); + mgb_assert( + found, + "algo %s got by heuristic not found in " + "candidate list", + palgo->name()); + return std::move(ret); + MIDOUT_E +} + +template +Maybe AlgoChooser::AlgoChooserHelper:: + profile_single_algo(const ImplExecutionPolicy& policy, double& timeout) const { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("profile_single_algo"))) + typename TimedProfiler::Param param; + // force check copy size <= dest len-1 from gcc8 for safe + param.execution_policy = + TimedProfiler::Param::ExecutionPolicyBlob::serialize(policy); + param.workspace = get_workspace_size_bytes(policy); + for (int i = 0; i < arity; ++i) { + auto&& src = m_fastrun_layouts[i]; + bool cond_normal = src.format.is_default() && + (src.dtype.category() == DTypeCategory::FLOAT || + src.dtype.category() == DTypeCategory::INT || + src.dtype.category() == DTypeCategory::QUANTIZED); + bool cond_low_bit = src.dtype.is_low_bit() && src.format.is_lowbit_aligned() && + (src.dtype.category() == DTypeCategory::QUANTIZED || + src.dtype.category() == DTypeCategory::LOWBIT); + MGB_MARK_USED_VAR(cond_normal); + MGB_MARK_USED_VAR(cond_low_bit); + mgb_assert( + cond_normal || cond_low_bit, "unsupported layout in profiling: %s", + src.to_string().c_str()); + param.dtypes[i] = src.dtype.enumv(); + } + param.comp_node_physical = m_cn.locator(); + param.comp_node_logical = m_cn.locator_logical(); + mgb_assert(param.shapes.size() == m_fastrun_layouts.size()); + for (size_t i = 0; i < param.shapes.size(); ++i) + param.shapes[i] = m_fastrun_layouts[i]; + param.opr_param = m_dnn_opr->param(); + param.allow_weight_preprocess = m_allow_weight_preprocess; + + Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo); + mgb_assert(palgo, "can not find algo when profile single algo"); + + auto rst = TimedProfiler::profile(param, timeout); + // MIOpen conv profiles all available algos when a specfic shape is + // provided for the first time, which probably adds to 
the result time. + // Therefore, a second profile execution is needed. + if (strncmp(palgo->name(), "MIOpen", 6) == 0) { + rst = TimedProfiler::profile(param, timeout); + } + if (!rst.valid()) + return None; + + std::string algo_desc; + serialize_write_pod(policy.algo, algo_desc); + return AlgoChooserProfileCache::ResultEntry{ + algo_desc, static_cast(palgo->attribute()), rst.val().time, + param.workspace}; + MIDOUT_E +} + +template +void AlgoChooser::AlgoChooserHelper::profile( + const ExecutionStrategy& selected_strategy) const { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("profile"))) + auto&& rst = get_profile_result_from_cache(selected_strategy); + if (rst.first.valid()) + return; + AlgoChooserProfileCache::Result prof_rst; + + auto target_attr = extract_algo_attribute(selected_strategy); + std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts); + double cur_timeout = 0; + + auto workspace_limit = + m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit); + RealTimer timer; + std::unordered_set rst_algos; + if (rst.second.valid()) { + std::transform( + rst.second.val().begin(), rst.second.val().end(), + std::inserter(rst_algos, rst_algos.end()), + [](const AlgoChooserProfileCache::ResultEntry& result) { + return result.algo; + }); + } + + for (auto algo : get_all_candidates()) { + std::string desc; + serialize_write_pod(algo.desc, desc); + if (rst_algos.find(desc) != rst_algos.end()) { + continue; + } + Maybe cur_rst; + + ImplExecutionPolicy policy; + policy.algo = algo.desc; + + //! check negative attribute : skip negative attribute + auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo); + if (palgo->contain_attribute_any(target_attr.second)) { + mgb_log_debug( + "skip algo %s, which matches the profile strategy required " + "'not contain attribute(%s).'", + algo.desc.name.c_str(), + Algorithm::attribute_str(target_attr.second).c_str()); + continue; + } + + //! 
check workspace limit + construct_execution_policy(selected_strategy, policy); + mgb_assert( + policy.algo.valid(), + "construct execution policy must success when profiling"); + if (get_workspace_size_bytes(policy) > workspace_limit) { + continue; + } + + std::string msg = ssprintf( + "profiling %s algorithm %s %s", ::MegDNNOpr2Typename::name, + algo.desc.name.c_str(), layouts_str.c_str()); + timer.reset(); + MGB_TRY { cur_rst = profile_single_algo(policy, cur_timeout); } + MGB_CATCH(std::exception & exc, { + mgb_log_warn("caught exception during %s: %s", msg.c_str(), exc.what()); + continue; + }) + MGB_CATCH(..., { + mgb_log_warn("caught exception during %s", msg.c_str()); + continue; + }) + if (!cur_rst.valid()) { + mgb_log_warn( + "timeout when %s; timeout setting: %.3fsec", msg.c_str(), + cur_timeout); + continue; + } + if (!cur_timeout) { + cur_timeout = timer.get_secs() + TIMEOUT_TOLERANCE; + } else { + cur_timeout = std::min(cur_timeout, timer.get_secs() + TIMEOUT_TOLERANCE); + } + auto&& rst = cur_rst.val(); + mgb_log_debug( + "%s: workspace: %zu; time: %.3gsec", msg.c_str(), rst.workspace, + rst.time); + prof_rst.push_back(rst); + } + std::string msg = ssprintf( + "no usable %s algorithm %s without attribute(%s) or could not meet " + "workspace limite requirement(%zu)", + ::MegDNNOpr2Typename::name, layouts_str.c_str(), + Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit); + mgb_assert(!prof_rst.empty(), "%s", msg.c_str()); + + if (rst.second.valid()) + prof_rst.insert( + prof_rst.end(), rst.second.val().begin(), rst.second.val().end()); + FixedTensorLayouts incache_layouts = m_incache_layouts; + typename Opr::Param origin_param = m_dnn_opr->param(); + AlgoChooserProfileCache::Key cache_key{ + incache_layouts.data(), incache_layouts.size(), &origin_param, + sizeof(origin_param)}; + + AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); + cache.put(cache_key, prof_rst); + MIDOUT_E +} + +template +Maybe> AlgoChooser::AlgoChooserHelper:: + construct_fake_preprocess_filter(const FixedTensorLayouts& layouts) const { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_fake_preprocess_filter"))) + Maybe> result = None; + const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts; + if (layouts.at(0).ndim) { + layouts_ptr = &layouts; + } + if_constexpr()>([&](auto _) { + if (!m_allow_weight_preprocess) + return; + auto opr = _(m_dnn_opr); + auto layouts = + APPLY(opr->deduce_preprocessed_filter_layout(args...), *layouts_ptr); + //! No preprocess layout means no need weight preprocess + if (layouts.empty()) { + return; + } + //! all layouts arm empty means no need weight preprocess + bool layout_valid = false; + for (auto&& layout : layouts) { + if (!layout.is_empty()) { + layout_valid = true; + } + } + if (!layout_valid) { + return; + } + + result = PreprocessFilter{}; + auto& res = result.val(); + res.algorithm_id = nullptr; + res.tensors.resize(layouts.size()); + for (size_t i = 0; i < layouts.size(); i++) { + res.tensors[i] = megdnn::TensorND(nullptr, layouts[i]); + } + }); + return result; + MIDOUT_E +} + +template +std::pair AlgoChooser::AlgoChooserHelper:: + extract_algo_attribute(const ExecutionStrategy& strategy) const { + std::pair ret = + std::make_pair(AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT); + + //! from strategy + if (strategy & ExecutionStrategy::REPRODUCIBLE) { + ret.first |= AlgoAttribute::REPRODUCIBLE; + } + if (strategy & ExecutionStrategy::OPTMIZED) { + ret.second |= AlgoAttribute::NAIVE; + } + + //! 
from graph option + // FIXME: no_profiling_on_shape_change extract USABLE_DEPEND_ON_SHAPE + // attribute when fixed usable + if (m_desc.shared_batch_size) { + ret.second |= AlgoAttribute::USABLE_DEPEND_ON_SHAPE; + } + + if (m_desc.binary_equal_between_batch) { + ret.first |= AlgoAttribute::REPRODUCIBLE; + ret.second |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; + } + + return ret; +} + +#define INST(Opr) \ + template AlgoChooser::AlgoChooserHelper::AlgoChooserHelper( \ + const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ + const std::string& param_str, const CompNode& cn, \ + const megdnn::param::ExecutionPolicy& execution_policy, \ + bool allow_weight_preprocess, const AlgoChooserDesc& desc); \ + template typename AlgoChooser::ImplExecutionPolicy \ + AlgoChooser::AlgoChooserHelper::choose_by_heuristic( \ + const ExecutionStrategy& select_strategy) const; \ + template typename AlgoChooser::ImplExecutionPolicy \ + AlgoChooser::AlgoChooserHelper::choose_by_profile( \ + const ExecutionStrategy& select_strategy, bool enable_update) const; \ + template typename std::pair< \ + AlgoChooser::ImplAlgoDesc, \ + Maybe> \ + AlgoChooser::AlgoChooserHelper::get_profile_result_from_cache( \ + const ExecutionStrategy& select_strategy) const; \ + template void \ + AlgoChooser::AlgoChooserHelper::construct_execution_policy( \ + const ExecutionStrategy& select_strategy, \ + typename AlgoChooser::ImplExecutionPolicy& policy, \ + bool retrive_from_cache, bool allow_log) const; \ + template size_t \ + AlgoChooser::AlgoChooserHelper::get_workspace_size_bytes( \ + const typename AlgoChooser::ImplExecutionPolicy& policy, \ + const FixedTensorLayouts& layouts) const; \ + template std::vector::ImplAlgo> \ + AlgoChooser::AlgoChooserHelper::get_all_candidates() const; \ + template Maybe \ + AlgoChooser::AlgoChooserHelper::profile_single_algo( \ + const typename AlgoChooser::ImplExecutionPolicy& policy, \ + double& timeout) const; \ + template std::pair \ + AlgoChooser::AlgoChooserHelper::extract_algo_attribute( \ + const ExecutionStrategy& strategy) const; \ + template void AlgoChooser::AlgoChooserHelper::profile( \ + const ExecutionStrategy& selected_strategy) const; + +DNN_FOREACH_FASTRUN_OPR(INST) +#undef INST + +//////////////////////////////// AlgoChoose ///////////////////////////// +template +typename AlgoChooser::ImplExecutionPolicy AlgoChooser::get_policy( + const AlgoChooserHelper& helper) { + auto opr_strategy = helper.execution_policy().strategy; + auto strategy2str = [](auto strategy) { + std::string ret; + if (strategy & ExecutionStrategy::HEURISTIC) { + ret += "HEURISTIC "; + } + if (strategy & ExecutionStrategy::PROFILE) { + ret += "PROFILE "; + } + if (strategy & ExecutionStrategy::REPRODUCIBLE) { + ret += "REPRODUCIBLE "; + } + if (strategy & ExecutionStrategy::OPTIMIZED) { + ret += "OPTIMIZED "; + } + return ret; + }; + mgb_log_debug("Use Stragegy :%s", strategy2str(opr_strategy).c_str()); + if (opr_strategy & ExecutionStrategy::HEURISTIC) { + if (opr_strategy & ExecutionStrategy::PROFILE) { + //! this strategy will choose from cache first, then choost by + //! heuristic if fail. 
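+ //! enable_update is false here, so choose_by_profile() only reads
+ //! already-cached results and never launches a profiling run; an
+ //! invalid policy.algo below signals a cache miss.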
+ ImplExecutionPolicy policy = helper.choose_by_profile(opr_strategy, false); + if (!policy.algo.valid()) { + policy = helper.choose_by_heuristic(opr_strategy); + } + return policy; + } else { + return helper.choose_by_heuristic(opr_strategy); + } + } +#if MGB_ENABLE_FASTRUN + else if (opr_strategy & ExecutionStrategy::PROFILE) { + return helper.choose_by_profile(opr_strategy, true); + } +#endif + else { + mgb_throw(InternalError, "bad ExecutionPolicy strategy"); + } +} + +template +std::string AlgoChooser::format_fixlayouts(const FixedTensorLayouts& layout) { + return ::format_fixlayouts(layout, arity_in, arity_out); +} + +#define INST(Opr) \ + template AlgoChooser::ImplExecutionPolicy \ + AlgoChooser::get_policy(const AlgoChooserHelper& proxy); \ + template std::string AlgoChooser::format_fixlayouts( \ + const FixedTensorLayouts& layout); + +DNN_FOREACH_FASTRUN_OPR(INST) +#undef INST + +} // namespace rdnn +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/rdnn/impl/management.cpp b/src/rdnn/impl/management.cpp new file mode 100644 index 00000000..990a0b70 --- /dev/null +++ b/src/rdnn/impl/management.cpp @@ -0,0 +1,66 @@ +#include "megbrain/rdnn/management.h" +#include "megbrain/comp_node_env.h" +#include "megbrain/tensor.h" +#include "megbrain/utils/metahelper.h" + +#include "megdnn/handle.h" +#include "megdnn/oprs.h" + +/* ================== global functions ================== */ + +using namespace mgb; +using namespace mgb::opr; + +namespace { +template +class MegDNNGlobalOprContainer final : public UserDataContainer::UserData { + MGB_TYPEINFO_OBJ_DECL; + + std::shared_ptr m_megdnn_handle; + std::unique_ptr m_opr; + +public: + MegDNNGlobalOprContainer(CompNode cn) + : m_megdnn_handle{intl::get_megdnn_handle_shared(cn)}, + m_opr{m_megdnn_handle->create_operator()} { + mgb_assert(m_opr->is_thread_safe()); + } + + Opr* get() const { return m_opr.get(); } +}; + +template +MGB_TYPEINFO_OBJ_IMPL(MegDNNGlobalOprContainer); +} // anonymous namespace + +std::shared_ptr intl::get_megdnn_handle_shared(CompNode comp_node) { + auto& handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)); + return {handle.shared_from_this(), handle.handle()}; +} + +megdnn::Handle* intl::get_megdnn_handle(CompNode comp_node) { + return MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)).handle(); +} + +template +Opr* intl::get_megdnn_global_opr(CompNode comp_node) { + using T = MegDNNGlobalOprContainer; + auto maker = [comp_node]() { return std::make_shared(comp_node); }; + return CompNodeEnv::from_comp_node(comp_node).get_user_data(maker).get(); +} + +namespace mgb { +namespace opr { +namespace intl { + +#define INST(o) template o* get_megdnn_global_opr(CompNode) +INST(megdnn::AddUpdate); +INST(megdnn::Relayout); +INST(megdnn::Checksum); +#undef INST + +} // namespace intl +} // namespace opr +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/rdnn/impl/profiler.cpp b/src/rdnn/impl/profiler.cpp new file mode 100644 index 00000000..12303b5c --- /dev/null +++ b/src/rdnn/impl/profiler.cpp @@ -0,0 +1,400 @@ +#include "megbrain/rdnn/profiler.h" +#include "megbrain/utils/invoke.h" + +#include "megdnn/handle.h" +#include "megdnn/oprs/base.h" + +#if MGB_ROCM +#include "hcc_detail/hcc_defs_prologue.h" +#include "megcore_rocm.h" +#endif + +//! TODO: here has to be know some megdnn::opr when there is produced midout.h +//! fix it if there is another graceful way. 
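+//! (the MIDOUT_B/MIDOUT_E regions below are tagged per megdnn opr type,
+//! which appears to be why the concrete opr headers must be visible here)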
+#include "megdnn/oprs.h" + +#include "midout.h" + +MIDOUT_DECL(megbrain_opr_profile) +#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) { +#define MIDOUT_E \ + } \ + MIDOUT_END(); + +namespace { +std::string serialize_policy(const megdnn::ExecutionPolicy& policy) { + std::string ret; + //! serialize AlgorithmDesc + megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret); + megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret); + uint32_t param_size = policy.algo.param.size(); + uint32_t name_size = policy.algo.name.size(); + megdnn::Algorithm::serialize_write_pod(param_size, ret); + megdnn::Algorithm::serialize_write_pod(name_size, ret); + ret += policy.algo.param; + ret += policy.algo.name; + + //! serialize sub_policy + uint32_t size = policy.sub_policy.size(); + megdnn::Algorithm::serialize_write_pod(size, ret); + for (auto&& sub : policy.sub_policy) { + ret += serialize_policy(sub); + } + return ret; +} + +megdnn::ExecutionPolicy deserialize_policy( + const char* buf, uint32_t size, uint32_t& offset) { + megdnn::ExecutionPolicy ret; +#define cb(_val, _type) \ + _val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \ + offset += sizeof(_val) + + cb(ret.algo.handle_type, megdnn::Handle::HandleType); + cb(ret.algo.type, uint32_t); + + uint32_t param_size = 0; + uint32_t name_size = 0; + cb(param_size, uint32_t); + cb(name_size, uint32_t); + if (param_size > 0) { + ret.algo.param = std::string(buf + offset, param_size); + offset += param_size; + } + if (name_size > 0) { + ret.algo.name = std::string(buf + offset, name_size); + offset += name_size; + } + + uint32_t nr_policy = 0; + cb(nr_policy, uint32_t); +#undef cb + + for (uint32_t i = 0; i < nr_policy; i++) { + ret.sub_policy.push_back(deserialize_policy(buf, size, offset)); + } + return ret; +} +} // namespace + +namespace mgb { +namespace rdnn { +#define APPLY(statement, ...) \ + mgb::apply( \ + [&](const auto&... 
args) { return statement; }, \ + std::tuple_cat(__VA_ARGS__)) + +////////////// TimedProfiler::Param::ExecutionPolicyBlob ////////////////////// + +template +typename TimedProfiler::Param::ExecutionPolicyBlob TimedProfiler::Param:: + ExecutionPolicyBlob::serialize(const megdnn::ExecutionPolicy& policy) { + ExecutionPolicyBlob ret; + std::string serialize_bin = serialize_policy(policy); + mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES); + memcpy(ret.data, serialize_bin.data(), serialize_bin.size()); + ret.size = serialize_bin.size(); + return ret; +} + +template +megdnn::ExecutionPolicy TimedProfiler::Param::ExecutionPolicyBlob::deserialize() + const { + uint32_t offset = 0; + auto&& ret = deserialize_policy(data, size, offset); + mgb_assert(offset == size); + return std::move(ret); +} + +#define INST(Opr) \ + template typename TimedProfiler::Param::ExecutionPolicyBlob \ + TimedProfiler::Param::ExecutionPolicyBlob::serialize( \ + const megdnn::ExecutionPolicy& policy); \ + template megdnn::ExecutionPolicy \ + TimedProfiler::Param::ExecutionPolicyBlob::deserialize() const; + +DNN_FOREACH_FASTRUN_OPR(INST) +#undef INST + +////////////////// TimedProfiler ////////////////////////////// + +template +const double TimedProfiler::timeout_setting = + TimedProfiler::init_timeout_setting(); + +template +double TimedProfiler::init_timeout_setting() { +#if MGB_ENABLE_FASTRUN + sys::TimedFuncInvoker::ins().register_func( + AlgoChooserFuncId::ID, &TimedProfiler::prof_impl, + &TimedProfiler::prof_init_device); + auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT"); + if (to_set) + return std::stod(to_set); +#endif + return 0; +} + +#define APPLY(statement, ...) \ + mgb::apply( \ + [&](const auto&... args) { return statement; }, \ + std::tuple_cat(__VA_ARGS__)) + +template +void TimedProfiler::preprocess( + const TensorLayoutArray&, const megdnn::SmallVector&, + UniqPtrWithCN&, megdnn::Workspace&, std::array&, + std::array&, PreprocessFilter&) { + // Opr is neither convbias nor convolution.This function do nothing. +} + +//! convbias +template <> +void TimedProfiler::preprocess( + const TensorLayoutArray& preprocessed_layout, + const SmallVector& flt_val, + UniqPtrWithCN& megdnn_opr, megdnn::Workspace& mdn_workspace, + std::array& layouts, + std::array& inp_val, + PreprocessFilter& prep_flt) { + if (!preprocessed_layout.empty()) { + auto&& pf = prep_flt; + pf.algorithm_id = nullptr; + pf.tensors.resize(flt_val.size()); + for (size_t i = 0; i < flt_val.size(); i++) { + pf.tensors[i] = flt_val[i].as_megdnn(); + } + APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), + std::forward_as_tuple( + layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()), + array_skip(layouts)); + } +} + +//! 
convolution +template <> +void TimedProfiler::preprocess( + const TensorLayoutArray& preprocessed_layout, + const megdnn::SmallVector& flt_val, + UniqPtrWithCN& megdnn_opr, + megdnn::Workspace& mdn_workspace, std::array& layouts, + std::array& inp_val, + PreprocessFilter& prep_flt) { + if (!preprocessed_layout.empty()) { + auto&& pf = prep_flt; + pf.algorithm_id = nullptr; + pf.tensors.resize(flt_val.size()); + for (size_t i = 0; i < flt_val.size(); i++) { + pf.tensors[i] = flt_val[i].as_megdnn(); + } + APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), + std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()), + array_skip<2>(layouts)); + } +} + +template +typename TimedProfiler::TResult TimedProfiler::prof_impl( + const TParam& raw_param) { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl"))) +#if MGB_ROCM + bool miopen_algo_search_enabled; + megcore::getMIOpenAlgoSearchStatus(&miopen_algo_search_enabled); + mgb_assert(miopen_algo_search_enabled, "MIOpen algo search not enabled"); +#endif + auto&& param = raw_param.as_single_pod(); + CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical); + auto megdnn_opr = opr::intl::create_megdnn_opr(cn); + std::array layouts; + + auto from_enum = [&](DTypeEnum enumv) -> DType { + switch (enumv) { + +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return _dt(1.0f, static_cast(0)) + cb(dtype::Quantized8Asymm); + cb(dtype::Quantized4Asymm); +#undef cb + +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return _dt(1.0f) + + cb(dtype::QuantizedS8); + cb(dtype::QuantizedS16); + cb(dtype::QuantizedS32); + cb(dtype::QuantizedS4); + default: + return DType::from_enum(enumv); +#undef cb + } + }; + for (int i = 0; i < arity; ++i) { + layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])}; + } + + megdnn_opr->param() = param.opr_param; + megdnn_opr->execution_policy() = param.execution_policy.deserialize(); + + // Allocate preprocessed weight buffers. 
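+ // Only oprs supporting weight preprocess (ConvBias / Convolution forward,
+ // see opr_supports_preprocess()) deduce a non-empty preprocessed_layout;
+ // for the rest it stays empty and the preprocess step below is a no-op.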
+ TensorLayoutArray preprocessed_layout; + if_constexpr()>([&](auto _) { + if (param.allow_weight_preprocess) { + preprocessed_layout = APPLY( + _(megdnn_opr)->deduce_preprocessed_filter_layout(args...), layouts); + } + }); + + { + // first allocate a whole chunk to avoid memory fragmentation (here we + // rely on memory allocator to reuse memory) + auto align = cn.get_mem_addr_alignment(); + size_t tot_size = align; + for (int i = 0; i < arity; ++i) { + tot_size += layouts[i].span().high_byte + align; + } + for (const auto& layout : preprocessed_layout) { + tot_size += layout.span().high_byte + align; + } + tot_size += param.workspace; + DeviceTensorStorage storage{cn}; + storage.ensure_size(tot_size); + } + + // allocate input and output memory + std::array inp_val; + std::array out_val; + DeviceTensorND workspace; + for (int i = 0; i < arity_in; ++i) { + inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]); + } + for (int i = 0; i < arity_out; ++i) { + out_val[i] + .comp_node(cn) + .dtype(layouts[arity_in + i].dtype) + .resize(layouts[arity_in + i]); + } + megdnn::Workspace mdn_workspace; + + // allocate workspace + if (param.workspace) { + workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace}); + mdn_workspace.size = param.workspace; + mdn_workspace.raw_ptr = workspace.raw_ptr(); + } + + // allocate storage for preprocessed filter + SmallVector flt_val(preprocessed_layout.size()); + for (size_t i = 0; i < preprocessed_layout.size(); i++) { + flt_val[i] = { + cn, preprocessed_layout[i], preprocessed_layout[i].dtype, + preprocessed_layout[i].format}; + } + + for (int i = 0; i < arity_in; ++i) { + fill_zero_dev_tensor(inp_val[i]); + } + + PreprocessFilter prep_flt; + preprocess( + preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val, + prep_flt); + + RealTimer timer; + auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER), + ev_end = cn.create_event(CompNode::Event::NEED_TIMER); + ev_start->record(); + if_constexpr()>( + [&](auto _) { + auto&& opr = _(megdnn_opr); + PreprocessFilter* pf = + preprocessed_layout.empty() ? 
nullptr : &prep_flt; + APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val, + out_val); + }, + /* else */ + [&](auto _) { + APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val, + out_val); + }); + ev_end->record(); + + megdnn::Algorithm* algo = + megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo); + mgb_assert(algo); + double next_report_time = 0.5; + while (!ev_end->finished()) { + if (timer.get_secs() >= next_report_time) { +#if MGB_ENABLE_GETENV + mgb_log_warn( + "profiling conv algo %s already took %.3f/%.3f secs" + " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", + algo->name(), timer.get_secs(), param.actual_timeout); +#else + mgb_log_warn( + "profiling conv algo %s already took %.3f/%.3f secs", algo->name(), + timer.get_secs(), param.actual_timeout); +#endif + next_report_time = timer.get_secs() + 1; + } + using namespace std::literals; +#if !__DEPLOY_ON_XP_SP2__ + std::this_thread::sleep_for(1000us); +#endif + } + // release all free blocks owned by child process, + // in order to avoid main process running out of memory + cn.try_coalesce_all_free_memory(); + + mgb_assert(ev_start->finished()); + return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)}); + MIDOUT_E +}; + +template +Maybe::Result> TimedProfiler::profile( + const Param& param, double& timeout) { + mgb_assert(timeout >= 0); + if (!timeout) { + timeout = timeout_setting; + } else if (timeout_setting) { + timeout = std::min(timeout, timeout_setting); + } + param.actual_timeout = timeout ? timeout : std::numeric_limits::infinity(); + auto res = sys::TimedFuncInvoker::ins().invoke( + AlgoChooserFuncId::ID, TParam::from_pod(const_cast(param)), + timeout); + if (res.valid()) + return res.val().template as_single_pod(); + return None; +} + +template +void TimedProfiler::prof_init_device(const TParam& raw_param) { + MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device"))) +#if MGB_ROCM + megcore::enableMIOpenAlgoSearch(true); +#endif + auto&& param = raw_param.as_single_pod(); + CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical); + // wait for cuda init, so its time does not get accounted in timeout + cn.sync(); + MIDOUT_E +} + +#define INST(Opr) \ + template const double TimedProfiler::timeout_setting; \ + template double TimedProfiler::init_timeout_setting(); \ + template typename TimedProfiler::TResult \ + TimedProfiler::prof_impl(const TParam& raw_param); \ + template Maybe::Result> \ + TimedProfiler::profile(const Param& param, double& timeout); \ + template void TimedProfiler::prof_init_device(const TParam& raw_param); + +DNN_FOREACH_FASTRUN_OPR(INST) +#undef INST +} // namespace rdnn +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/rdnn/include/megbrain/rdnn/algo_chooser.h b/src/rdnn/include/megbrain/rdnn/algo_chooser.h new file mode 100644 index 00000000..98defa7d --- /dev/null +++ b/src/rdnn/include/megbrain/rdnn/algo_chooser.h @@ -0,0 +1,174 @@ +#pragma once + +#include +#include "megbrain/opr/param_defs.h" +#include "megbrain/rdnn/profiler.h" +#include "megbrain/utils/persistent_cache.h" +#include "megdnn/oprs/base.h" + +namespace mgb { +namespace rdnn { + +//! define logical operation of megdnn::param::ExecutionPolicy::Strategy::Enum +//! 
and megdnn::detail::AlgoAttribute enum +using ExecutionStrategy = megdnn::param::ExecutionPolicy::Strategy; + +using AlgoAttribute = megdnn::AlgoAttribute; + +/* =================== AlgoChooser =================== */ +/*! + * \brief choose algorithm according to ExecutionPolicy + * + * This class only provides static methods, and the entry point is + * AlgoChooser::setup_algo. When profiling is needed, it would first try to + * retrive profiling stats from cache, and run TimedProfiler when necessary + * + * \tparam Opr megdnn operator impl + */ +struct AlgoChooserDesc { + uint32_t shared_batch_size = 0; + bool binary_equal_between_batch = false; + bool no_profiling_on_shape_change = false; + using WorkspaceLimitGetter = std::function; + WorkspaceLimitGetter get_workspace_limit; +}; + +template +class AlgoChooser { + static constexpr int arity_in = OprArityTrait::arity_in; + static constexpr int arity_out = OprArityTrait::arity_out; + static constexpr int arity = OprArityTrait::arity; + + using ImplAlgo = typename Opr::AlgorithmInfo; + using ImplAlgoDesc = typename Opr::AlgorithmInfo::Desc; + +protected: + using ImplExecutionPolicy = megdnn::ExecutionPolicy; + +public: + using FixedTensorLayouts = std::array; + + class AlgoChooserHelper { + //! fastrun layouts + FixedTensorLayouts m_fastrun_layouts; + //! layouts used when get and set cache item + FixedTensorLayouts m_incache_layouts; + Opr* m_dnn_opr; + std::string m_param; + CompNode m_cn; + megdnn::param::ExecutionPolicy m_execution_policy; + bool m_allow_weight_preprocess; + const AlgoChooserDesc& m_desc; + + public: + MGE_WIN_DECLSPEC_FUC AlgoChooserHelper( + const FixedTensorLayouts& layouts, Opr* megdnn_opr, + const std::string& param_str, const CompNode& cn, + const megdnn::param::ExecutionPolicy& execution_policy, + bool allow_weight_preprocess, const AlgoChooserDesc& desc); + + Opr* megdnn_opr() const { return m_dnn_opr; } + + const TensorLayout& inp_layout(size_t idx) const { + return m_fastrun_layouts[idx]; + } + + const megdnn::param::ExecutionPolicy& execution_policy() const { + return m_execution_policy; + } + CompNode comp_node() const { return m_cn; } + const std::string& param() const { return m_param; } + + bool allow_weight_preprocess() const { return m_allow_weight_preprocess; } + + megdnn::Algorithm* get_algorithm_from_desc( + const megdnn::Algorithm::Info::Desc& desc) const { + return m_dnn_opr->get_algorithm_from_desc(desc); + } + + const FixedTensorLayouts& fastrun_layouts() const { return m_fastrun_layouts; } + + const FixedTensorLayouts& incache_layouts() const { return m_incache_layouts; } + + const AlgoChooserDesc& desc() const { return m_desc; } + + //! construct algo chain by heuristic + ImplExecutionPolicy choose_by_heuristic( + const ExecutionStrategy& selected_strategy) const; + + //! construct algo chain by profiling + ImplExecutionPolicy choose_by_profile( + const ExecutionStrategy& selected_strategy, bool enable_update) const; + + //! get all profile algorithm from cache, return invalid if not exists + std::pair> + get_profile_result_from_cache(const ExecutionStrategy& selected_strategy) const; + + /** + * \brief construct execution policy from cache or heuristic. + * + * \param selected_strategy select algo which matched this strategy + * \param[in,out] policy execution policy + * \param retrive_from_cache retrive algo from cache if set True, get + * from heuristic otherwise. + * \param allow_log no warning log print if set True, print warning info + * otherwise. 
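+ *
+ * If no policy can be constructed (cache miss, or a sub-opr policy
+ * fails to build), \p policy is left or reset invalid; callers detect
+ * this via policy.algo.valid().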
+        void construct_execution_policy(
+                const ExecutionStrategy& selected_strategy, ImplExecutionPolicy& policy,
+                bool retrive_from_cache = true, bool allow_log = true) const;
+
+        //! get workspace size required for a specific execution policy
+        MGE_WIN_DECLSPEC_FUC size_t get_workspace_size_bytes(
+                const ImplExecutionPolicy& policy,
+                const FixedTensorLayouts& layouts = {}) const;
+
+        //! get all candidate algos; the one chosen by choose_by_heuristic()
+        //! is put first
+        std::vector<ImplAlgo> get_all_candidates() const;
+
+        /*!
+         * \brief profile a single algorithm
+         *
+         * This is actually a wrapper that constructs the param and calls
+         * TimedProfiler::profile for the actual profiling
+         *
+         * \param[in,out] timeout set the timeout, and return the actual
+         *      timeout used during profiling
+         */
+        Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo(
+                const ImplExecutionPolicy& policy, double& timeout) const;
+
+        //! profile and save to cache
+        void profile(const ExecutionStrategy& selected_strategy) const;
+
+        /**
+         * \brief extract algo attribute from execution strategy and graph
+         * option.
+         *
+         * \param strategy select algos matching this strategy
+         * \return pair<positive_attr, negative_attr>
+         */
+        std::pair<AlgoAttribute, AlgoAttribute> extract_algo_attribute(
+                const ExecutionStrategy& strategy) const;
+
+    private:
+        Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter(
+                const FixedTensorLayouts& layouts = {}) const;
+    };
+
+    template <typename U>
+    friend class AlgoChooser;
+
+    //! entrance for getting an algorithm according to the execution strategy
+    MGE_WIN_DECLSPEC_FUC static ImplExecutionPolicy get_policy(
+            const AlgoChooserHelper& helper);
+
+    //! format given layouts to string
+    static std::string format_fixlayouts(const FixedTensorLayouts& layout);
+};
+
+} // namespace rdnn
+} // namespace mgb
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
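The pair<positive_attr, negative_attr> returned by extract_algo_attribute above drives algorithm filtering. One plausible reading of that contract, restated as a standalone sketch with hypothetical names (the real attribute bits live in megdnn::AlgoAttribute): a candidate is acceptable only if it carries every attribute in the positive mask and none in the negative mask.

    #include <cstdint>
    #include <utility>

    using AttrMask = uint32_t;
    constexpr AttrMask REPRODUCIBLE = 1u << 0;  // stand-ins for AlgoAttribute bits
    constexpr AttrMask NAIVE = 1u << 1;

    // (positive, negative) requirement, shaped like extract_algo_attribute()'s result
    using AttrPair = std::pair<AttrMask, AttrMask>;

    bool algo_acceptable(AttrMask algo_attrs, const AttrPair& req) {
        bool has_all_positive = (algo_attrs & req.first) == req.first;
        bool has_no_negative = (algo_attrs & req.second) == 0;
        return has_all_positive && has_no_negative;
    }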
diff --git a/src/rdnn/include/megbrain/rdnn/management.h b/src/rdnn/include/megbrain/rdnn/management.h
new file mode 100644
index 00000000..8425a65b
--- /dev/null
+++ b/src/rdnn/include/megbrain/rdnn/management.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "megbrain/comp_node.h"
+#include "megdnn/handle.h"
+
+namespace mgb {
+namespace opr {
+namespace intl {
+
+//! get megdnn handle from comp node
+MGE_WIN_DECLSPEC_FUC megdnn::Handle* get_megdnn_handle(CompNode comp_node);
+MGE_WIN_DECLSPEC_FUC std::shared_ptr<megdnn::Handle> get_megdnn_handle_shared(
+        CompNode comp_node);
+
+/*!
+ * \brief get the global megdnn operator associated with a computing node
+ * \tparam Opr megdnn operator class, must be one of:
+ *      * AddUpdate
+ *      * Relayout
+ *      * Checksum
+ */
+template <typename Opr>
+MGE_WIN_DECLSPEC_FUC Opr* get_megdnn_global_opr(CompNode comp_node);
+
+template <typename T>
+class UniqPtrWithCN : public std::unique_ptr<T> {
+    CompNode m_cn;
+
+public:
+    UniqPtrWithCN() = default;
+
+    template <typename S>
+    UniqPtrWithCN(UniqPtrWithCN<S>&& o)
+            : std::unique_ptr<T>(std::move(o)), m_cn(o.comp_node()) {}
+
+    UniqPtrWithCN(std::unique_ptr<T> ptr, CompNode cn)
+            : std::unique_ptr<T>{std::move(ptr)}, m_cn{cn} {}
+
+    CompNode comp_node() const { return m_cn; }
+};
+
+//! create a megdnn opr from the megdnn handle of a CompNode
+template <typename Opr>
+UniqPtrWithCN<Opr> create_megdnn_opr(CompNode comp_node) {
+    return {get_megdnn_handle(comp_node)->create_operator<Opr>(), comp_node};
+}
+
+} // namespace intl
+} // namespace opr
+
+namespace rdnn {
+template <typename T>
+using UniqPtrWithCN = opr::intl::UniqPtrWithCN<T>;
+} // namespace rdnn
+} // namespace mgb
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
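UniqPtrWithCN is a small but useful pattern: an owning pointer that also remembers which CompNode the object was created on, so later code can route work back to the right device handle. A standalone analogue with stand-in types (an int id instead of CompNode, a dummy Conv instead of a megdnn operator; not part of the patch):

    #include <memory>
    #include <utility>

    // owning pointer that records which device produced the object
    template <typename T>
    class UniqPtrWithId : public std::unique_ptr<T> {
        int m_id = -1;

    public:
        UniqPtrWithId() = default;
        UniqPtrWithId(std::unique_ptr<T> ptr, int id)
                : std::unique_ptr<T>{std::move(ptr)}, m_id{id} {}
        int id() const { return m_id; }
    };

    struct Conv {};  // stand-in for a megdnn operator type

    // mirrors the shape of create_megdnn_opr(): construct, then tag with the device
    UniqPtrWithId<Conv> make_conv_on(int device_id) {
        return {std::make_unique<Conv>(), device_id};
    }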
diff --git a/src/rdnn/include/megbrain/rdnn/profiler.h b/src/rdnn/include/megbrain/rdnn/profiler.h
new file mode 100644
index 00000000..5537a70f
--- /dev/null
+++ b/src/rdnn/include/megbrain/rdnn/profiler.h
@@ -0,0 +1,152 @@
+#pragma once
+
+#include "megbrain/comp_node.h"
+#include "megbrain/rdnn/management.h"
+#include "megbrain/system.h"
+#include "megbrain/tensor.h"
+#include "megbrain/utils/hash_ct.h"
+#include "megbrain/utils/timer.h"
+
+#include "megdnn/basic_types.h"
+#include "megdnn/oprs.h"
+
+namespace mgb {
+namespace rdnn {
+
+// clang-format off
+#define DNN_FOREACH_FASTRUN_OPR(cb)  \
+    cb(ConvolutionForward)           \
+    cb(ConvBiasForward)              \
+    cb(ConvolutionBackwardData)      \
+    cb(ConvolutionBackwardFilter)    \
+    cb(Convolution3DForward)         \
+    cb(Convolution3DBackwardData)    \
+    cb(Convolution3DBackwardFilter)  \
+    cb(LocalShareForward)            \
+    cb(LocalShareBackwardData)       \
+    cb(LocalShareBackwardFilter)     \
+    cb(DeformableConvForward)        \
+    cb(DeformableConvBackwardFilter) \
+    cb(DeformableConvBackwardData)   \
+    cb(BatchConvBiasForward)         \
+    cb(MatrixMul)                    \
+    cb(BatchedMatrixMul)             \
+    cb(PoolingForward)               \
+    cb(PoolingBackward)
+// clang-format on
+
+template <typename Opr>
+constexpr bool opr_supports_preprocess() {
+    return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
+           std::is_same<Opr, megdnn::ConvBias>::value;
+}
+
+template <typename Opr>
+constexpr bool opr_contain_bias() {
+    return std::is_same<Opr, megdnn::ConvBias>::value;
+}
+
+//! matmul and batchedMatrixMul
+template <typename Opr>
+constexpr bool is_matmul() {
+    return std::is_same<Opr, megdnn::MatrixMul>::value ||
+           std::is_same<Opr, megdnn::BatchedMatrixMul>::value;
+}
+
+template <typename Opr, bool has_prep>
+struct PreprocessFilterImpl {
+    using T = union {};
+};
+
+template <typename Opr>
+struct PreprocessFilterImpl<Opr, true> {
+    using T = typename Opr::PreprocessedFilter;
+};
+
+template <typename Opr>
+using PreprocessFilter =
+        typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T;
+
+template <typename Opr>
+struct AlgoChooserFuncId {};
+
+#define DEF_FUNC_ID(func)                                                           \
+    template <>                                                                     \
+    struct AlgoChooserFuncId<megdnn::func> {                                        \
+        __attribute__((unused)) static constexpr sys::TimedFuncInvoker::FuncId ID = \
+                static_cast<sys::TimedFuncInvoker::FuncId>(                         \
+                        MGB_HASH_STR("megdnn::" #func));                            \
+    };

+
+DNN_FOREACH_FASTRUN_OPR(DEF_FUNC_ID)
+
+#undef DEF_FUNC_ID
+
+/* =================== TimedProfiler =================== */
+
+/*!
+ * \brief profile a megdnn opr with a given param
+ *
+ * This class only provides static methods, and the entry point is
+ * TimedProfiler::profile; it runs the profiler in a timed environment via
+ * sys::TimedFuncInvoker
+ *
+ * \tparam Opr megdnn opr impl
+ */
+template <typename Opr>
+class TimedProfiler {
+    static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
+    static constexpr int arity_out = OprArityTrait<Opr>::arity_out;
+    static constexpr int arity = OprArityTrait<Opr>::arity;
+
+    using TensorShapeArray = std::array<megdnn::TensorShape, arity>;
+
+public:
+    struct Param {
+        struct ExecutionPolicyBlob {
+            //! enlarge the max size if needed
+            constexpr static size_t MAX_SIZE_IN_BYTES = 10240;
+            char data[MAX_SIZE_IN_BYTES];
+            uint32_t size;
+
+            static ExecutionPolicyBlob serialize(const megdnn::ExecutionPolicy& policy);
+            megdnn::ExecutionPolicy deserialize() const;
+        };
+        ExecutionPolicyBlob execution_policy;
+        size_t workspace;
+        megdnn::DTypeEnum dtypes[arity];
+        CompNode::Locator comp_node_physical, comp_node_logical;
+        TensorShapeArray shapes;
+        typename Opr::Param opr_param;
+        bool allow_weight_preprocess;
+
+        //! filled by profile()
+        mutable double actual_timeout;
+    };
+
+    struct Result {
+        double time;
+    };
+
+    static Maybe<Result> profile(const Param& param, double& timeout);
+
+private:
+    using TParam = sys::TimedFuncInvoker::Param;
+    using TResult = sys::TimedFuncInvoker::Result;
+
+    static const double timeout_setting;
+
+    static double init_timeout_setting();
+    static void preprocess(
+            const megdnn::TensorLayoutArray& preprocessed_layout,
+            const SmallVector<DeviceTensorND>& flt_val, UniqPtrWithCN<Opr>& megdnn_opr,
+            megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
+            std::array<DeviceTensorND, arity_in>& inp_val,
+            PreprocessFilter<Opr>& prep_flt);
+    static TResult prof_impl(const TParam& raw_param);
+    static void prof_init_device(const TParam& raw_param);
+};
+} // namespace rdnn
+} // namespace mgb
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
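Param above appears to be kept a flat POD on purpose: it crosses the sys::TimedFuncInvoker boundary by plain byte copy (note TParam::from_pod in profiler.cpp), which is why the variable-size megdnn::ExecutionPolicy travels as the fixed-capacity ExecutionPolicyBlob. A standalone sketch of that pattern, with a std::string standing in for the serialized policy bytes (names hypothetical, not part of the patch):

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <string>

    struct Blob {
        static constexpr std::size_t MAX_SIZE_IN_BYTES = 10240;
        char data[MAX_SIZE_IN_BYTES];
        uint32_t size;

        // pack a variable-size payload into the fixed buffer
        static Blob serialize(const std::string& payload) {
            Blob b;
            assert(payload.size() <= MAX_SIZE_IN_BYTES);
            std::memcpy(b.data, payload.data(), payload.size());
            b.size = static_cast<uint32_t>(payload.size());
            return b;
        }

        // recover the payload on the other side of the copy
        std::string deserialize() const { return std::string(data, size); }
    };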