@@ -320,15 +320,12 @@ public: | |||
input->force_assign_dev_tensor_from_tensor(dev_tensor); | |||
mgb_assert(input->comp_node() == dev_tensor.comp_node()); | |||
mgb_assert(input->shape().eq_shape(layout)); | |||
mgb_assert(input->dtype() == layout.dtype); | |||
idx++; | |||
} | |||
} | |||
void init_output_tensor(const SmallVector<Tensor*>& outputs) { | |||
mgb_assert(m_opr->usable_output().size() == outputs.size()); | |||
::mgb::opr::intl::WorkspaceLimitHook::set_impl( | |||
m_opr->owner_graph(), get_workspace_limit); | |||
@@ -347,9 +344,6 @@ public: | |||
mgb_assert(j < outputs.size()); | |||
auto&& tensor = outputs[j]; | |||
auto&& layout = tensor->layout(); | |||
mgb_assert(var->comp_node() == tensor->comp_node()); | |||
mgb_assert(var->shape().eq_shape(layout)); | |||
mgb_assert(var->dtype() == layout.dtype); | |||
if (var->m_mem_plan.chunk().owner_var != var) { | |||
tensor->assign_from_dev_tensor( | |||
var->m_dev_tensor); // memory forwarding | |||
@@ -816,7 +810,6 @@ public: | |||
// minigraph.opr()->usable_output() bug execution may use the attrs for those | |||
// output var, so we infer attrs for all outputs, but only return | |||
// LogicalTensorDesc for minigraph.opr()->usable_output() | |||
SmallVector<LogicalTensorDesc> output_descs; | |||
for (size_t i = 0; i < minigraph.opr()->output().size(); ++i) { | |||
auto* var = minigraph.opr()->output()[i]; | |||
auto* shape = sess.infer(sess.output_data[i].shape_infer, true); | |||
@@ -824,19 +817,15 @@ public: | |||
var->shape(*shape); | |||
} | |||
for (size_t i = 0; i < minigraph.output_size(); ++i) { | |||
SmallVector<TensorPtr> outputs(minigraph.output_size(), {}); | |||
for (size_t i = 0; i < outputs.size(); i++) { | |||
auto* ovar = minigraph.output_var(i); | |||
mgb_assert(ovar->dtype().valid() && ovar->comp_node().valid()); | |||
mgb_assert( | |||
ovar->shape().ndim || | |||
ovar->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC)); | |||
output_descs.push_back({{ovar->shape(), ovar->dtype()}, ovar->comp_node()}); | |||
} | |||
SmallVector<TensorPtr> outputs(output_descs.size(), {}); | |||
for (size_t i = 0; i < outputs.size(); i++) { | |||
outputs[i] = | |||
Tensor::make(output_descs[i].layout, output_descs[i].comp_node); | |||
outputs[i] = Tensor::make( | |||
TensorLayout{ovar->shape(), ovar->dtype()}, ovar->comp_node()); | |||
} | |||
auto raw_outputs = to_raw_ptr_array(outputs, false); | |||
@@ -0,0 +1,66 @@ | |||
#include "megbrain/rdnn/management.h" | |||
#include "megbrain/comp_node_env.h" | |||
#include "megbrain/tensor.h" | |||
#include "megbrain/utils/metahelper.h" | |||
#include "megdnn/handle.h" | |||
#include "megdnn/oprs.h" | |||
/* ================== global functions ================== */ | |||
using namespace mgb; | |||
using namespace mgb::opr; | |||
namespace { | |||
template <class Opr> | |||
class MegDNNGlobalOprContainer final : public UserDataContainer::UserData { | |||
MGB_TYPEINFO_OBJ_DECL; | |||
std::shared_ptr<megdnn::Handle> m_megdnn_handle; | |||
std::unique_ptr<Opr> m_opr; | |||
public: | |||
MegDNNGlobalOprContainer(CompNode cn) | |||
: m_megdnn_handle{intl::get_megdnn_handle_shared(cn)}, | |||
m_opr{m_megdnn_handle->create_operator<Opr>()} { | |||
mgb_assert(m_opr->is_thread_safe()); | |||
} | |||
Opr* get() const { return m_opr.get(); } | |||
}; | |||
template <class Opr> | |||
MGB_TYPEINFO_OBJ_IMPL(MegDNNGlobalOprContainer<Opr>); | |||
} // anonymous namespace | |||
std::shared_ptr<megdnn::Handle> intl::get_megdnn_handle_shared(CompNode comp_node) { | |||
auto& handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)); | |||
return {handle.shared_from_this(), handle.handle()}; | |||
} | |||
megdnn::Handle* intl::get_megdnn_handle(CompNode comp_node) { | |||
return MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)).handle(); | |||
} | |||
template <typename Opr> | |||
Opr* intl::get_megdnn_global_opr(CompNode comp_node) { | |||
using T = MegDNNGlobalOprContainer<Opr>; | |||
auto maker = [comp_node]() { return std::make_shared<T>(comp_node); }; | |||
return CompNodeEnv::from_comp_node(comp_node).get_user_data<T>(maker).get(); | |||
} | |||
namespace mgb { | |||
namespace opr { | |||
namespace intl { | |||
#define INST(o) template o* get_megdnn_global_opr<o>(CompNode) | |||
INST(megdnn::AddUpdate); | |||
INST(megdnn::Relayout); | |||
INST(megdnn::Checksum); | |||
#undef INST | |||
} // namespace intl | |||
} // namespace opr | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,400 @@ | |||
#include "megbrain/rdnn/profiler.h" | |||
#include "megbrain/utils/invoke.h" | |||
#include "megdnn/handle.h" | |||
#include "megdnn/oprs/base.h" | |||
#if MGB_ROCM | |||
#include "hcc_detail/hcc_defs_prologue.h" | |||
#include "megcore_rocm.h" | |||
#endif | |||
//! TODO: here has to be know some megdnn::opr when there is produced midout.h | |||
//! fix it if there is another graceful way. | |||
#include "megdnn/oprs.h" | |||
#include "midout.h" | |||
MIDOUT_DECL(megbrain_opr_profile) | |||
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) { | |||
#define MIDOUT_E \ | |||
} \ | |||
MIDOUT_END(); | |||
namespace { | |||
std::string serialize_policy(const megdnn::ExecutionPolicy& policy) { | |||
std::string ret; | |||
//! serialize AlgorithmDesc | |||
megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret); | |||
megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret); | |||
uint32_t param_size = policy.algo.param.size(); | |||
uint32_t name_size = policy.algo.name.size(); | |||
megdnn::Algorithm::serialize_write_pod<uint32_t>(param_size, ret); | |||
megdnn::Algorithm::serialize_write_pod<uint32_t>(name_size, ret); | |||
ret += policy.algo.param; | |||
ret += policy.algo.name; | |||
//! serialize sub_policy | |||
uint32_t size = policy.sub_policy.size(); | |||
megdnn::Algorithm::serialize_write_pod(size, ret); | |||
for (auto&& sub : policy.sub_policy) { | |||
ret += serialize_policy(sub); | |||
} | |||
return ret; | |||
} | |||
megdnn::ExecutionPolicy deserialize_policy( | |||
const char* buf, uint32_t size, uint32_t& offset) { | |||
megdnn::ExecutionPolicy ret; | |||
#define cb(_val, _type) \ | |||
_val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \ | |||
offset += sizeof(_val) | |||
cb(ret.algo.handle_type, megdnn::Handle::HandleType); | |||
cb(ret.algo.type, uint32_t); | |||
uint32_t param_size = 0; | |||
uint32_t name_size = 0; | |||
cb(param_size, uint32_t); | |||
cb(name_size, uint32_t); | |||
if (param_size > 0) { | |||
ret.algo.param = std::string(buf + offset, param_size); | |||
offset += param_size; | |||
} | |||
if (name_size > 0) { | |||
ret.algo.name = std::string(buf + offset, name_size); | |||
offset += name_size; | |||
} | |||
uint32_t nr_policy = 0; | |||
cb(nr_policy, uint32_t); | |||
#undef cb | |||
for (uint32_t i = 0; i < nr_policy; i++) { | |||
ret.sub_policy.push_back(deserialize_policy(buf, size, offset)); | |||
} | |||
return ret; | |||
} | |||
} // namespace | |||
namespace mgb { | |||
namespace rdnn { | |||
#define APPLY(statement, ...) \ | |||
mgb::apply( \ | |||
[&](const auto&... args) { return statement; }, \ | |||
std::tuple_cat(__VA_ARGS__)) | |||
////////////// TimedProfiler::Param::ExecutionPolicyBlob ////////////////////// | |||
template <typename Opr> | |||
typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob TimedProfiler<Opr>::Param:: | |||
ExecutionPolicyBlob::serialize(const megdnn::ExecutionPolicy& policy) { | |||
ExecutionPolicyBlob ret; | |||
std::string serialize_bin = serialize_policy(policy); | |||
mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES); | |||
memcpy(ret.data, serialize_bin.data(), serialize_bin.size()); | |||
ret.size = serialize_bin.size(); | |||
return ret; | |||
} | |||
template <typename Opr> | |||
megdnn::ExecutionPolicy TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize() | |||
const { | |||
uint32_t offset = 0; | |||
auto&& ret = deserialize_policy(data, size, offset); | |||
mgb_assert(offset == size); | |||
return std::move(ret); | |||
} | |||
#define INST(Opr) \ | |||
template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob \ | |||
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize( \ | |||
const megdnn::ExecutionPolicy& policy); \ | |||
template megdnn::ExecutionPolicy \ | |||
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() const; | |||
DNN_FOREACH_FASTRUN_OPR(INST) | |||
#undef INST | |||
////////////////// TimedProfiler ////////////////////////////// | |||
template <typename Opr> | |||
const double TimedProfiler<Opr>::timeout_setting = | |||
TimedProfiler<Opr>::init_timeout_setting(); | |||
template <typename Opr> | |||
double TimedProfiler<Opr>::init_timeout_setting() { | |||
#if MGB_ENABLE_FASTRUN | |||
sys::TimedFuncInvoker::ins().register_func( | |||
AlgoChooserFuncId<Opr>::ID, &TimedProfiler<Opr>::prof_impl, | |||
&TimedProfiler<Opr>::prof_init_device); | |||
auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT"); | |||
if (to_set) | |||
return std::stod(to_set); | |||
#endif | |||
return 0; | |||
} | |||
#define APPLY(statement, ...) \ | |||
mgb::apply( \ | |||
[&](const auto&... args) { return statement; }, \ | |||
std::tuple_cat(__VA_ARGS__)) | |||
template <typename Opr> | |||
void TimedProfiler<Opr>::preprocess( | |||
const TensorLayoutArray&, const megdnn::SmallVector<DeviceTensorND>&, | |||
UniqPtrWithCN<Opr>&, megdnn::Workspace&, std::array<TensorLayout, arity>&, | |||
std::array<DeviceTensorND, arity_in>&, PreprocessFilter<Opr>&) { | |||
// Opr is neither convbias nor convolution.This function do nothing. | |||
} | |||
//! convbias | |||
template <> | |||
void TimedProfiler<megdnn::ConvBias>::preprocess( | |||
const TensorLayoutArray& preprocessed_layout, | |||
const SmallVector<DeviceTensorND>& flt_val, | |||
UniqPtrWithCN<megdnn::ConvBias>& megdnn_opr, megdnn::Workspace& mdn_workspace, | |||
std::array<TensorLayout, arity>& layouts, | |||
std::array<DeviceTensorND, arity_in>& inp_val, | |||
PreprocessFilter<megdnn::ConvBias>& prep_flt) { | |||
if (!preprocessed_layout.empty()) { | |||
auto&& pf = prep_flt; | |||
pf.algorithm_id = nullptr; | |||
pf.tensors.resize(flt_val.size()); | |||
for (size_t i = 0; i < flt_val.size(); i++) { | |||
pf.tensors[i] = flt_val[i].as_megdnn(); | |||
} | |||
APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), | |||
std::forward_as_tuple( | |||
layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()), | |||
array_skip<arity_in - 1>(layouts)); | |||
} | |||
} | |||
//! convolution | |||
template <> | |||
void TimedProfiler<megdnn::ConvolutionForward>::preprocess( | |||
const TensorLayoutArray& preprocessed_layout, | |||
const megdnn::SmallVector<DeviceTensorND>& flt_val, | |||
UniqPtrWithCN<megdnn::ConvolutionForward>& megdnn_opr, | |||
megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts, | |||
std::array<DeviceTensorND, arity_in>& inp_val, | |||
PreprocessFilter<megdnn::ConvolutionForward>& prep_flt) { | |||
if (!preprocessed_layout.empty()) { | |||
auto&& pf = prep_flt; | |||
pf.algorithm_id = nullptr; | |||
pf.tensors.resize(flt_val.size()); | |||
for (size_t i = 0; i < flt_val.size(); i++) { | |||
pf.tensors[i] = flt_val[i].as_megdnn(); | |||
} | |||
APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), | |||
std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()), | |||
array_skip<2>(layouts)); | |||
} | |||
} | |||
template <typename Opr> | |||
typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||
const TParam& raw_param) { | |||
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl"))) | |||
#if MGB_ROCM | |||
bool miopen_algo_search_enabled; | |||
megcore::getMIOpenAlgoSearchStatus(&miopen_algo_search_enabled); | |||
mgb_assert(miopen_algo_search_enabled, "MIOpen algo search not enabled"); | |||
#endif | |||
auto&& param = raw_param.as_single_pod<Param>(); | |||
CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical); | |||
auto megdnn_opr = opr::intl::create_megdnn_opr<Opr>(cn); | |||
std::array<TensorLayout, arity> layouts; | |||
auto from_enum = [&](DTypeEnum enumv) -> DType { | |||
switch (enumv) { | |||
#define cb(_dt) \ | |||
case DTypeTrait<_dt>::enumv: \ | |||
return _dt(1.0f, static_cast<uint8_t>(0)) | |||
cb(dtype::Quantized8Asymm); | |||
cb(dtype::Quantized4Asymm); | |||
#undef cb | |||
#define cb(_dt) \ | |||
case DTypeTrait<_dt>::enumv: \ | |||
return _dt(1.0f) | |||
cb(dtype::QuantizedS8); | |||
cb(dtype::QuantizedS16); | |||
cb(dtype::QuantizedS32); | |||
cb(dtype::QuantizedS4); | |||
default: | |||
return DType::from_enum(enumv); | |||
#undef cb | |||
} | |||
}; | |||
for (int i = 0; i < arity; ++i) { | |||
layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])}; | |||
} | |||
megdnn_opr->param() = param.opr_param; | |||
megdnn_opr->execution_policy() = param.execution_policy.deserialize(); | |||
// Allocate preprocessed weight buffers. | |||
TensorLayoutArray preprocessed_layout; | |||
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) { | |||
if (param.allow_weight_preprocess) { | |||
preprocessed_layout = APPLY( | |||
_(megdnn_opr)->deduce_preprocessed_filter_layout(args...), layouts); | |||
} | |||
}); | |||
{ | |||
// first allocate a whole chunk to avoid memory fragmentation (here we | |||
// rely on memory allocator to reuse memory) | |||
auto align = cn.get_mem_addr_alignment(); | |||
size_t tot_size = align; | |||
for (int i = 0; i < arity; ++i) { | |||
tot_size += layouts[i].span().high_byte + align; | |||
} | |||
for (const auto& layout : preprocessed_layout) { | |||
tot_size += layout.span().high_byte + align; | |||
} | |||
tot_size += param.workspace; | |||
DeviceTensorStorage storage{cn}; | |||
storage.ensure_size(tot_size); | |||
} | |||
// allocate input and output memory | |||
std::array<DeviceTensorND, arity_in> inp_val; | |||
std::array<DeviceTensorND, arity_out> out_val; | |||
DeviceTensorND workspace; | |||
for (int i = 0; i < arity_in; ++i) { | |||
inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]); | |||
} | |||
for (int i = 0; i < arity_out; ++i) { | |||
out_val[i] | |||
.comp_node(cn) | |||
.dtype(layouts[arity_in + i].dtype) | |||
.resize(layouts[arity_in + i]); | |||
} | |||
megdnn::Workspace mdn_workspace; | |||
// allocate workspace | |||
if (param.workspace) { | |||
workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace}); | |||
mdn_workspace.size = param.workspace; | |||
mdn_workspace.raw_ptr = workspace.raw_ptr(); | |||
} | |||
// allocate storage for preprocessed filter | |||
SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size()); | |||
for (size_t i = 0; i < preprocessed_layout.size(); i++) { | |||
flt_val[i] = { | |||
cn, preprocessed_layout[i], preprocessed_layout[i].dtype, | |||
preprocessed_layout[i].format}; | |||
} | |||
for (int i = 0; i < arity_in; ++i) { | |||
fill_zero_dev_tensor(inp_val[i]); | |||
} | |||
PreprocessFilter<Opr> prep_flt; | |||
preprocess( | |||
preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val, | |||
prep_flt); | |||
RealTimer timer; | |||
auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER), | |||
ev_end = cn.create_event(CompNode::Event::NEED_TIMER); | |||
ev_start->record(); | |||
if_constexpr<opr_supports_preprocess<Opr>()>( | |||
[&](auto _) { | |||
auto&& opr = _(megdnn_opr); | |||
PreprocessFilter<Opr>* pf = | |||
preprocessed_layout.empty() ? nullptr : &prep_flt; | |||
APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val, | |||
out_val); | |||
}, | |||
/* else */ | |||
[&](auto _) { | |||
APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val, | |||
out_val); | |||
}); | |||
ev_end->record(); | |||
megdnn::Algorithm* algo = | |||
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo); | |||
mgb_assert(algo); | |||
double next_report_time = 0.5; | |||
while (!ev_end->finished()) { | |||
if (timer.get_secs() >= next_report_time) { | |||
#if MGB_ENABLE_GETENV | |||
mgb_log_warn( | |||
"profiling conv algo %s already took %.3f/%.3f secs" | |||
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", | |||
algo->name(), timer.get_secs(), param.actual_timeout); | |||
#else | |||
mgb_log_warn( | |||
"profiling conv algo %s already took %.3f/%.3f secs", algo->name(), | |||
timer.get_secs(), param.actual_timeout); | |||
#endif | |||
next_report_time = timer.get_secs() + 1; | |||
} | |||
using namespace std::literals; | |||
#if !__DEPLOY_ON_XP_SP2__ | |||
std::this_thread::sleep_for(1000us); | |||
#endif | |||
} | |||
// release all free blocks owned by child process, | |||
// in order to avoid main process running out of memory | |||
cn.try_coalesce_all_free_memory(); | |||
mgb_assert(ev_start->finished()); | |||
return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)}); | |||
MIDOUT_E | |||
}; | |||
template <typename Opr> | |||
Maybe<typename TimedProfiler<Opr>::Result> TimedProfiler<Opr>::profile( | |||
const Param& param, double& timeout) { | |||
mgb_assert(timeout >= 0); | |||
if (!timeout) { | |||
timeout = timeout_setting; | |||
} else if (timeout_setting) { | |||
timeout = std::min(timeout, timeout_setting); | |||
} | |||
param.actual_timeout = timeout ? timeout : std::numeric_limits<double>::infinity(); | |||
auto res = sys::TimedFuncInvoker::ins().invoke( | |||
AlgoChooserFuncId<Opr>::ID, TParam::from_pod(const_cast<Param&>(param)), | |||
timeout); | |||
if (res.valid()) | |||
return res.val().template as_single_pod<Result>(); | |||
return None; | |||
} | |||
template <typename Opr> | |||
void TimedProfiler<Opr>::prof_init_device(const TParam& raw_param) { | |||
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device"))) | |||
#if MGB_ROCM | |||
megcore::enableMIOpenAlgoSearch(true); | |||
#endif | |||
auto&& param = raw_param.as_single_pod<Param>(); | |||
CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical); | |||
// wait for cuda init, so its time does not get accounted in timeout | |||
cn.sync(); | |||
MIDOUT_E | |||
} | |||
#define INST(Opr) \ | |||
template const double TimedProfiler<megdnn::Opr>::timeout_setting; \ | |||
template double TimedProfiler<megdnn::Opr>::init_timeout_setting(); \ | |||
template typename TimedProfiler<megdnn::Opr>::TResult \ | |||
TimedProfiler<megdnn::Opr>::prof_impl(const TParam& raw_param); \ | |||
template Maybe<typename TimedProfiler<megdnn::Opr>::Result> \ | |||
TimedProfiler<megdnn::Opr>::profile(const Param& param, double& timeout); \ | |||
template void TimedProfiler<megdnn::Opr>::prof_init_device(const TParam& raw_param); | |||
DNN_FOREACH_FASTRUN_OPR(INST) | |||
#undef INST | |||
} // namespace rdnn | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,174 @@ | |||
#pragma once | |||
#include <memory> | |||
#include "megbrain/opr/param_defs.h" | |||
#include "megbrain/rdnn/profiler.h" | |||
#include "megbrain/utils/persistent_cache.h" | |||
#include "megdnn/oprs/base.h" | |||
namespace mgb { | |||
namespace rdnn { | |||
//! define logical operation of megdnn::param::ExecutionPolicy::Strategy::Enum | |||
//! and megdnn::detail::AlgoAttribute enum | |||
using ExecutionStrategy = megdnn::param::ExecutionPolicy::Strategy; | |||
using AlgoAttribute = megdnn::AlgoAttribute; | |||
/* =================== AlgoChooser =================== */ | |||
/*! | |||
* \brief choose algorithm according to ExecutionPolicy | |||
* | |||
* This class only provides static methods, and the entry point is | |||
* AlgoChooser::setup_algo. When profiling is needed, it would first try to | |||
* retrive profiling stats from cache, and run TimedProfiler when necessary | |||
* | |||
* \tparam Opr megdnn operator impl | |||
*/ | |||
struct AlgoChooserDesc { | |||
uint32_t shared_batch_size = 0; | |||
bool binary_equal_between_batch = false; | |||
bool no_profiling_on_shape_change = false; | |||
using WorkspaceLimitGetter = std::function<size_t(CompNode, size_t)>; | |||
WorkspaceLimitGetter get_workspace_limit; | |||
}; | |||
template <typename Opr> | |||
class AlgoChooser { | |||
static constexpr int arity_in = OprArityTrait<Opr>::arity_in; | |||
static constexpr int arity_out = OprArityTrait<Opr>::arity_out; | |||
static constexpr int arity = OprArityTrait<Opr>::arity; | |||
using ImplAlgo = typename Opr::AlgorithmInfo; | |||
using ImplAlgoDesc = typename Opr::AlgorithmInfo::Desc; | |||
protected: | |||
using ImplExecutionPolicy = megdnn::ExecutionPolicy; | |||
public: | |||
using FixedTensorLayouts = std::array<TensorLayout, arity>; | |||
class AlgoChooserHelper { | |||
//! fastrun layouts | |||
FixedTensorLayouts m_fastrun_layouts; | |||
//! layouts used when get and set cache item | |||
FixedTensorLayouts m_incache_layouts; | |||
Opr* m_dnn_opr; | |||
std::string m_param; | |||
CompNode m_cn; | |||
megdnn::param::ExecutionPolicy m_execution_policy; | |||
bool m_allow_weight_preprocess; | |||
const AlgoChooserDesc& m_desc; | |||
public: | |||
MGE_WIN_DECLSPEC_FUC AlgoChooserHelper( | |||
const FixedTensorLayouts& layouts, Opr* megdnn_opr, | |||
const std::string& param_str, const CompNode& cn, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
bool allow_weight_preprocess, const AlgoChooserDesc& desc); | |||
Opr* megdnn_opr() const { return m_dnn_opr; } | |||
const TensorLayout& inp_layout(size_t idx) const { | |||
return m_fastrun_layouts[idx]; | |||
} | |||
const megdnn::param::ExecutionPolicy& execution_policy() const { | |||
return m_execution_policy; | |||
} | |||
CompNode comp_node() const { return m_cn; } | |||
const std::string& param() const { return m_param; } | |||
bool allow_weight_preprocess() const { return m_allow_weight_preprocess; } | |||
megdnn::Algorithm* get_algorithm_from_desc( | |||
const megdnn::Algorithm::Info::Desc& desc) const { | |||
return m_dnn_opr->get_algorithm_from_desc(desc); | |||
} | |||
const FixedTensorLayouts& fastrun_layouts() const { return m_fastrun_layouts; } | |||
const FixedTensorLayouts& incache_layouts() const { return m_incache_layouts; } | |||
const AlgoChooserDesc& desc() const { return m_desc; } | |||
//! construct algo chain by heuristic | |||
ImplExecutionPolicy choose_by_heuristic( | |||
const ExecutionStrategy& selected_strategy) const; | |||
//! construct algo chain by profiling | |||
ImplExecutionPolicy choose_by_profile( | |||
const ExecutionStrategy& selected_strategy, bool enable_update) const; | |||
//! get all profile algorithm from cache, return invalid if not exists | |||
std::pair<ImplAlgoDesc, Maybe<AlgoChooserProfileCache::Result>> | |||
get_profile_result_from_cache(const ExecutionStrategy& selected_strategy) const; | |||
/** | |||
* \brief construct execution policy from cache or heuristic. | |||
* | |||
* \param selected_strategy select algo which matched this strategy | |||
* \param[in,out] policy execution policy | |||
* \param retrive_from_cache retrive algo from cache if set True, get | |||
* from heuristic otherwise. | |||
* \param allow_log no warning log print if set True, print warning info | |||
* otherwise. | |||
*/ | |||
void construct_execution_policy( | |||
const ExecutionStrategy& selected_strategy, ImplExecutionPolicy& policy, | |||
bool retrive_from_cache = true, bool allow_log = true) const; | |||
//! get workspace size required for specific execution policy | |||
MGE_WIN_DECLSPEC_FUC size_t get_workspace_size_bytes( | |||
const ImplExecutionPolicy& policy, | |||
const FixedTensorLayouts& layouts = {}) const; | |||
//! get all candidate algos, and the one choose_by_heuristic() is | |||
//! put first | |||
std::vector<ImplAlgo> get_all_candidates() const; | |||
/*! | |||
* \brief profile a single algorithm | |||
* | |||
* This is actually a wrapper that constructs param and call | |||
* TimedProfiler<Opr>::profile for the actual profiling | |||
* | |||
* \param[in,out] timeout set the timeout, and return the actual | |||
* timeout used during profiling | |||
*/ | |||
Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo( | |||
const ImplExecutionPolicy& policy, double& timeout) const; | |||
//! profile and save to cache | |||
void profile(const ExecutionStrategy& selected_strategy) const; | |||
/** | |||
* \brief extract algo attribute from execution strategy and graph | |||
* option. | |||
* | |||
* \param strategy select algo which matched this strategy | |||
* \return pair<positive_attr, negative_attr> | |||
*/ | |||
std::pair<AlgoAttribute, AlgoAttribute> extract_algo_attribute( | |||
const ExecutionStrategy& strategy) const; | |||
private: | |||
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter( | |||
const FixedTensorLayouts& layouts = {}) const; | |||
}; | |||
template <typename U> | |||
friend class AlgoChooser; | |||
//! entrance for getting algorithm according to execution strategy | |||
MGE_WIN_DECLSPEC_FUC static ImplExecutionPolicy get_policy( | |||
const AlgoChooserHelper& helper); | |||
//! format given layouts to string | |||
static std::string format_fixlayouts(const FixedTensorLayouts& layout); | |||
}; | |||
} // namespace rdnn | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,57 @@ | |||
#pragma once | |||
#include "megbrain/comp_node.h" | |||
#include "megdnn/handle.h" | |||
namespace mgb { | |||
namespace opr { | |||
namespace intl { | |||
//! get megdnn handle from comp node | |||
MGE_WIN_DECLSPEC_FUC megdnn::Handle* get_megdnn_handle(CompNode comp_node); | |||
MGE_WIN_DECLSPEC_FUC std::shared_ptr<megdnn::Handle> get_megdnn_handle_shared( | |||
CompNode comp_node); | |||
/*! | |||
* \brief get global megdnn operator asscoated with a computing node | |||
* \tparam Opr megdnn operator class, must be one of: | |||
* * AddUpdate | |||
* * Relayout | |||
* * Checksum | |||
*/ | |||
template <typename Opr> | |||
MGE_WIN_DECLSPEC_FUC Opr* get_megdnn_global_opr(CompNode comp_node); | |||
template <class Obj> | |||
class UniqPtrWithCN : public std::unique_ptr<Obj> { | |||
CompNode m_cn; | |||
public: | |||
UniqPtrWithCN() = default; | |||
template <class RObj> | |||
UniqPtrWithCN(UniqPtrWithCN<RObj>&& o) | |||
: std::unique_ptr<Obj>(std::move(o)), m_cn(o.comp_node()) {} | |||
UniqPtrWithCN(std::unique_ptr<Obj> ptr, CompNode cn) | |||
: std::unique_ptr<Obj>{std::move(ptr)}, m_cn{cn} {} | |||
CompNode comp_node() const { return m_cn; } | |||
}; | |||
//! create megdnn opr from megdnn handle in a CompNode | |||
template <class Opr> | |||
UniqPtrWithCN<Opr> create_megdnn_opr(CompNode comp_node) { | |||
return {get_megdnn_handle(comp_node)->create_operator<Opr>(), comp_node}; | |||
} | |||
} // namespace intl | |||
} // namespace opr | |||
namespace rdnn { | |||
template <typename Obj> | |||
using UniqPtrWithCN = opr::intl::UniqPtrWithCN<Obj>; | |||
} // namespace rdnn | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,152 @@ | |||
#pragma once | |||
#include "megbrain/comp_node.h" | |||
#include "megbrain/rdnn/management.h" | |||
#include "megbrain/system.h" | |||
#include "megbrain/tensor.h" | |||
#include "megbrain/utils/hash_ct.h" | |||
#include "megbrain/utils/timer.h" | |||
#include "megdnn/basic_types.h" | |||
#include "megdnn/oprs.h" | |||
namespace mgb { | |||
namespace rdnn { | |||
// clang-format off | |||
#define DNN_FOREACH_FASTRUN_OPR(cb) \ | |||
cb(ConvolutionForward) \ | |||
cb(ConvBiasForward) \ | |||
cb(ConvolutionBackwardData) \ | |||
cb(ConvolutionBackwardFilter) \ | |||
cb(Convolution3DForward) \ | |||
cb(Convolution3DBackwardData) \ | |||
cb(Convolution3DBackwardFilter) \ | |||
cb(LocalShareForward) \ | |||
cb(LocalShareBackwardData) \ | |||
cb(LocalShareBackwardFilter) \ | |||
cb(DeformableConvForward) \ | |||
cb(DeformableConvBackwardFilter) \ | |||
cb(DeformableConvBackwardData) \ | |||
cb(BatchConvBiasForward) \ | |||
cb(MatrixMul) \ | |||
cb(BatchedMatrixMul) \ | |||
cb(PoolingForward) \ | |||
cb(PoolingBackward) | |||
// clang-format on | |||
template <typename Opr> | |||
constexpr bool opr_supports_preprocess() { | |||
return std::is_same<Opr, megdnn::ConvolutionForward>::value || | |||
std::is_same<Opr, megdnn::ConvBias>::value; | |||
} | |||
template <typename Opr> | |||
constexpr bool opr_contain_bias() { | |||
return std::is_same<Opr, megdnn::ConvBias>::value; | |||
} | |||
//! matmul and batchedMatrixMul | |||
template <typename Opr> | |||
constexpr bool is_matmul() { | |||
return std::is_same<Opr, megdnn::MatrixMul>::value || | |||
std::is_same<Opr, megdnn::BatchedMatrixMul>::value; | |||
} | |||
template <typename Opr, bool has_prep> | |||
struct PreprocessFilterImpl { | |||
using T = union {}; | |||
}; | |||
template <typename Opr> | |||
struct PreprocessFilterImpl<Opr, true> { | |||
using T = typename Opr::PreprocessedFilter; | |||
}; | |||
template <typename Opr> | |||
using PreprocessFilter = | |||
typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T; | |||
template <typename Opr> | |||
struct AlgoChooserFuncId {}; | |||
#define DEF_FUNC_ID(func) \ | |||
template <> \ | |||
struct AlgoChooserFuncId<megdnn::func> { \ | |||
__attribute__((unused)) static constexpr sys::TimedFuncInvoker::FuncId ID = \ | |||
static_cast<sys::TimedFuncInvoker::FuncId>( \ | |||
MGB_HASH_STR("megdnn::" #func)); \ | |||
}; | |||
DNN_FOREACH_FASTRUN_OPR(DEF_FUNC_ID) | |||
#undef DEF_FUNC_ID | |||
/* =================== TimedProfiler =================== */ | |||
/*! | |||
* \brief profile a megdnn opr conv with given param | |||
* | |||
* This class only provides static methods, and the entry point is | |||
* TimedProfiler::profile; it would run profiler in a timed environment by | |||
* sys::TimedFuncInvoker | |||
* | |||
* \tparam Opr megdnn opr impl | |||
*/ | |||
template <typename Opr> | |||
class TimedProfiler { | |||
static constexpr int arity_in = OprArityTrait<Opr>::arity_in; | |||
static constexpr int arity_out = OprArityTrait<Opr>::arity_out; | |||
static constexpr int arity = OprArityTrait<Opr>::arity; | |||
using TensorShapeArray = std::array<megdnn::TensorShape, arity>; | |||
public: | |||
struct Param { | |||
struct ExecutionPolicyBlob { | |||
//! enlarge the max size if needed | |||
constexpr static size_t MAX_SIZE_IN_BYTES = 10240; | |||
char data[MAX_SIZE_IN_BYTES]; | |||
uint32_t size; | |||
static ExecutionPolicyBlob serialize(const megdnn::ExecutionPolicy& policy); | |||
megdnn::ExecutionPolicy deserialize() const; | |||
}; | |||
ExecutionPolicyBlob execution_policy; | |||
size_t workspace; | |||
megdnn::DTypeEnum dtypes[arity]; | |||
CompNode::Locator comp_node_physical, comp_node_logical; | |||
TensorShapeArray shapes; | |||
typename Opr::Param opr_param; | |||
bool allow_weight_preprocess; | |||
//! filled by profile() | |||
mutable double actual_timeout; | |||
}; | |||
struct Result { | |||
double time; | |||
}; | |||
static Maybe<Result> profile(const Param& param, double& timeout); | |||
private: | |||
using TParam = sys::TimedFuncInvoker::Param; | |||
using TResult = sys::TimedFuncInvoker::Result; | |||
static const double timeout_setting; | |||
static double init_timeout_setting(); | |||
static void preprocess( | |||
const megdnn::TensorLayoutArray& preprocessed_layout, | |||
const SmallVector<DeviceTensorND>& flt_val, UniqPtrWithCN<Opr>& megdnn_opr, | |||
megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts, | |||
std::array<DeviceTensorND, arity_in>& inp_val, | |||
PreprocessFilter<Opr>& prep_flt); | |||
static TResult prof_impl(const TParam& raw_param); | |||
static void prof_init_device(const TParam& raw_param); | |||
}; | |||
} // namespace rdnn | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |