GitOrigin-RevId: f56aa5a505
HuaHua404-patch-1
@@ -11,7 +11,8 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo(
         const typename mgb::rdnn::AlgoChooser<Opr>::FixedTensorLayouts& layouts,
         Opr* megdnn_opr, uint32_t shared_batch_size, bool binary_equal_between_batch,
         bool no_profiling_on_shape_change, CompNode comp_node,
-        megdnn::param::ExecutionPolicy execution_policy, bool allow_weight_preprocess) {
+        megdnn::param::ExecutionPolicy execution_policy, bool allow_weight_preprocess,
+        SmallVector<megdnn::TensorND>* inp_tensornds = nullptr) {
     megdnn::AlgorithmCache::Key cache_key(
             megdnn_opr->handle(), megdnn_opr->get_opr_type(), layouts.data(),
             layouts.size(), &megdnn_opr->param(), sizeof(megdnn_opr->param()));
@@ -39,7 +40,7 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo(
     using AlgoChooserHelper = typename mgb::rdnn::AlgoChooser<Opr>::AlgoChooserHelper;
     AlgoChooserHelper helper(
             layouts, megdnn_opr, param_str, comp_node, execution_policy,
-            allow_weight_preprocess, desc);
+            allow_weight_preprocess, desc, inp_tensornds);

     megdnn::ExecutionPolicy policy;
     policy = mgb::rdnn::AlgoChooser<Opr>::get_policy(helper);
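
Note: the two hunks above are the core of this change. setup_algo() gains an
optional pointer to the operator's real input tensors and forwards it into the
AlgoChooserHelper, so FastRun can profile candidate algorithms on the caller's
actual input data instead of freshly allocated zero-filled buffers. A minimal
call-site sketch (the caller, tensor names, and layouts here are illustrative,
not part of this patch):

    // hypothetical caller: profile a convolution on its real inputs
    SmallVector<megdnn::TensorND> inp_tensornds(2);
    inp_tensornds[0] = src->dnn_tensor();     // assumed Tensor accessor
    inp_tensornds[1] = filter->dnn_tensor();
    size_t wk_bytes = setup_algo<megdnn::ConvolutionForward>(
            {src_layout, flt_layout, dst_layout}, dnn_opr, 0, false, false, cn,
            policy, false, &inp_tensornds);

Passing nullptr (the default) preserves the previous profiling behavior.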
@@ -141,11 +141,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
             def, inputs[0]->layout().ndim, inputs[0]->layout(),
             inputs[1]->layout());

-    DeviceTensorND out =
-            BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
     using TensorND = megdnn::TensorND;
-    SmallVector<TensorND> inp_tensornds(inputs.size());
+    SmallVector<TensorND> inp_tensornds(inputs.size() + 2);
     TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size());
     for (unsigned i = 0; i < inputs.size(); ++i) {
         inp_tensornds[i] = inputs[i]->dnn_tensor();
@@ -168,13 +165,20 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     TensorLayout empty_shp({0}, inputs[0]->dtype());
     empty_shp.ndim = 0;
-    DeviceTensorND empty_bias =
-            BlobManager::inst()->alloc_workspace_with_defrag(cn, empty_shp);

     size_t sz = setup_algo<megdnn::ConvBiasForward>(
             {inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]},
-            dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false);
+            dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false,
+            &inp_tensornds);

     // alloc memory
+    DeviceTensorND empty_bias =
+            BlobManager::inst()->alloc_workspace_with_defrag(cn, empty_shp);
+    inp_tensornds[2] = empty_bias.as_megdnn();
+    inp_tensornds[3] = empty_bias.as_megdnn();
+
+    DeviceTensorND out =
+            BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);

     TensorLayout w_layout({sz}, dtype::Byte());
     auto dnn_wk = dnn_opr.create_workspace(w_layout);
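
Note: the allocation order is flipped in this hunk and the ones below: output
(and empty-bias) buffers are allocated only after setup_algo() returns, which
keeps that device memory free while profiling runs. The two trailing slots of
the enlarged inp_tensornds are then filled with the empty bias/z tensors for
the actual exec call.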
@@ -364,9 +368,6 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
             def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(),
             cn);

-    DeviceTensorND out =
-            BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
-
     using TensorND = megdnn::TensorND;
     SmallVector<TensorND> inp_tensornds(inputs.size());
     TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size());
@@ -380,7 +381,10 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     size_t sz = setup_algo<megdnn::ConvolutionBackwardData>(
             {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false,
-            false, cn, convbwd.policy(), false);
+            false, cn, convbwd.policy(), false, &inp_tensornds);
+
+    DeviceTensorND out =
+            BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);

     auto wk = Blob::make(cn, sz);
     auto ptr = wk->storage().get();
@@ -542,7 +546,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     // shape infer
     size_t sz = setup_algo<megdnn::Convolution3D>(
             {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false,
-            false, cn, conv.policy(), false);
+            false, cn, conv.policy(), false, &inp_tensornds);

     // alloc memory
     DeviceTensorND out =
@@ -598,8 +602,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
         SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
     auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>();
     auto cn = inputs[0]->comp_node();
-    megdnn::TensorND weight = inputs[0]->dnn_tensor();
-    megdnn::TensorND diff = inputs[1]->dnn_tensor();
+    auto&& wlayout = inputs[0]->layout();
+    auto&& dlayout = inputs[1]->layout();

     DnnOprCaller<megdnn::Convolution3DBackwardData> caller(cn);
     auto&& dnn_opr = caller.op;
@@ -608,21 +613,24 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     TensorLayout& oup_layout = output_descs[0].layout;
     if (!validated) {
         megdnn::Convolution3DBackwardData::deduce_layout_impl(
-                weight.layout, diff.layout, op_def.param(), oup_layout);
+                wlayout, dlayout, op_def.param(), oup_layout);
     }
     DeviceTensorND oup =
             BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout);

+    SmallVector<megdnn::TensorND> inp_tensornds(inputs.size());
+    inp_tensornds[0] = inputs[0]->dnn_tensor();
+    inp_tensornds[1] = inputs[1]->dnn_tensor();
     size_t wk_size = setup_algo<megdnn::Convolution3DBackwardData>(
-            {weight.layout, diff.layout, oup_layout}, dnn_opr.get(), 0, false, false,
-            cn, op_def.policy(), false);
+            {wlayout, dlayout, oup_layout}, dnn_opr.get(), 0, false, false, cn,
+            op_def.policy(), false, &inp_tensornds);

     megdnn::Workspace dnn_wk;
     if (wk_size != 0) {
         TensorLayout w_layout({wk_size}, dtype::Byte());
         dnn_wk = caller.create_workspace(w_layout);
     }

-    dnn_opr->exec(weight, diff, oup.as_megdnn(), dnn_wk);
+    dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], oup.as_megdnn(), dnn_wk);
     return {Tensor::make(oup)};
 }
@@ -229,12 +229,11 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
         inp_tensornds[0].layout = layout_a;
         inp_tensornds[1].layout = layout_b;
     }
-    DeviceTensorND out =
-            BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout);
     size_t sz = setup_algo<megdnn::MatrixMul>(
             {layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
-            matmul.policy(), false);
+            matmul.policy(), false, &inp_tensornds);
+
+    DeviceTensorND out =
+            BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout);

     TensorLayout w_layout({sz}, dtype::Byte());
     auto dnn_wk = dnn_opr.create_workspace(w_layout);
@@ -470,21 +469,22 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
         return {Tensor::make(out)};
     }

-    using TensorND = megdnn::TensorND;
-    TensorND inp_nd1 = inp1->dnn_tensor();
-    inp_nd1.layout = layout1;
-    TensorND inp_nd2 = inp2->dnn_tensor();
-    inp_nd2.layout = layout2;
+    SmallVector<megdnn::TensorND> inp_tensornds(2u);
+    inp_tensornds[0] = inp1->dnn_tensor();
+    inp_tensornds[0].layout = layout1;
+    inp_tensornds[1] = inp2->dnn_tensor();
+    inp_tensornds[1].layout = layout2;
+
+    size_t sz = setup_algo<megdnn::BatchedMatrixMul>(
+            {layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
+            matmul.policy(), false, &inp_tensornds);

     DeviceTensorND out =
             BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout);
-    size_t sz = setup_algo<megdnn::BatchedMatrixMul>(
-            {layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
-            matmul.policy(), false);

     TensorLayout w_layout({sz}, dtype::Byte());
     auto dnn_wk = dnn_opr.create_workspace(w_layout);

-    dnn_opr.op->exec(inp_nd1, inp_nd2, out.as_megdnn(), dnn_wk);
+    dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out.as_megdnn(), dnn_wk);

     shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2];
     shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1];
@@ -49,23 +49,25 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     auto&& op_def = def.cast_final_safe<Pooling>();
     auto cn = inputs[0]->comp_node();

-    megdnn::TensorND inp_tensornd = inputs[0]->dnn_tensor();
-
     DnnOprCaller<megdnn::Pooling> caller(cn);
     auto&& dnn_opr = caller.op;
     dnn_opr->param() = op_def.param();

+    SmallVector<megdnn::TensorND> inp_tensornds(inputs.size());
+    inp_tensornds[0] = inputs[0]->dnn_tensor();
     TensorLayout& oup_layout = output_descs[0].layout;
     if (!validated) {
         megdnn::Pooling::deduce_layout_impl(
-                inp_tensornd.layout, op_def.param(), oup_layout);
+                inp_tensornds[0].layout, op_def.param(), oup_layout);
     }

-    DeviceTensorND out_devtensor =
-            BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout);
     size_t wk_size = setup_algo<megdnn::Pooling>(
-            {inp_tensornd.layout, oup_layout}, dnn_opr.get(), 0, false, false, cn,
-            op_def.policy(), false);
+            {inp_tensornds[0].layout, oup_layout}, dnn_opr.get(), 0, false, false, cn,
+            op_def.policy(), false, &inp_tensornds);
+
+    DeviceTensorND out_devtensor =
+            BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout);

     megdnn::Workspace dnn_wk;
     if (wk_size) {
@@ -73,7 +75,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
         dnn_wk = caller.create_workspace(w_layout);
     }

-    dnn_opr->exec(inp_tensornd, out_devtensor.as_megdnn(), dnn_wk);
+    dnn_opr->exec(inp_tensornds[0], out_devtensor.as_megdnn(), dnn_wk);
     return {Tensor::make(out_devtensor)};
 }
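
Note: the same pattern repeats across the ConvolutionBackwardData,
Convolution3D, Convolution3DBackwardData, MatrixMul, BatchedMatrixMul, and
Pooling hunks above: gather the inputs into a SmallVector<megdnn::TensorND>,
pass its address to setup_algo(), allocate outputs afterwards, and reuse the
same TensorNDs for the final exec() call.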
@@ -265,7 +265,8 @@ std::vector<megdnn::Algorithm::SearchItem> flatten_search_space(
         typename rdnn::AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
                 to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
                 _item.param, helper.comp_node(), helper.execution_policy(),
-                helper.allow_weight_preprocess(), helper.desc());
+                helper.allow_weight_preprocess(), helper.desc(),
+                helper.get_input());
         auto space = flatten_search_space<_Opr>(sub_helper, checker);
         ret.insert(ret.end(), space.begin(), space.end());
     });
@@ -488,7 +489,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
         const FixedTensorLayouts& layouts, Opr* megdnn_opr,
         const std::string& param_str, const CompNode& cn,
         const megdnn::param::ExecutionPolicy& execution_policy,
-        bool allow_weight_preprocess, const AlgoChooserDesc& desc)
+        bool allow_weight_preprocess, const AlgoChooserDesc& desc,
+        SmallVector<megdnn::TensorND>* inputs)
         : m_fastrun_layouts{layouts},
           m_incache_layouts{layouts},
           m_dnn_opr{megdnn_opr},
@@ -496,7 +498,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
           m_cn{cn},
           m_execution_policy{execution_policy},
           m_allow_weight_preprocess{allow_weight_preprocess},
-          m_desc{desc} {
+          m_desc{desc},
+          m_inputs{inputs} {
     auto fastrun_batch_size = desc.shared_batch_size;

     if (fastrun_batch_size) {
@@ -604,7 +607,7 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
         typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
                 to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
                 _item.param, m_cn, m_execution_policy, m_allow_weight_preprocess,
-                m_desc);
+                m_desc, m_inputs);
         sub_helper.profile(selected_strategy);
     });
 }
@@ -868,6 +871,7 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
         param.shapes[i] = m_fastrun_layouts[i];
     param.opr_param = m_dnn_opr->param();
     param.allow_weight_preprocess = m_allow_weight_preprocess;
+    param.inp_tensornds = m_inputs;

     Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
     mgb_assert(palgo, "can not find algo when profile single algo");
@@ -964,7 +968,9 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
         if (!policy.algo.valid())
            continue;
         size_t workspace_needed = get_workspace_size_bytes(policy);
-        if (data_size + workspace_needed >
+        if (m_inputs != nullptr)
+            workspace_needed += data_size;
+        if (workspace_needed >
             m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
             continue;
         }
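
Note: the candidate filter above now charges data_size against the workspace
limit only when caller-provided inputs are present (m_inputs != nullptr),
replacing the unconditional data_size + workspace_needed comparison. This
pairs with the prof_impl change further down, which adjusts how tensor memory
is counted when real inputs are supplied.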
@@ -1101,7 +1107,8 @@ std::pair<AlgoAttribute, AlgoAttribute> AlgoChooser<Opr>::AlgoChooserHelper::
             const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr,   \
             const std::string& param_str, const CompNode& cn,             \
             const megdnn::param::ExecutionPolicy& execution_policy,       \
-            bool allow_weight_preprocess, const AlgoChooserDesc& desc);   \
+            bool allow_weight_preprocess, const AlgoChooserDesc& desc,    \
+            SmallVector<megdnn::TensorND>* inputs);                       \
     template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy       \
     AlgoChooser<megdnn::Opr>::AlgoChooserHelper::choose_by_heuristic(     \
             const ExecutionStrategy& select_strategy) const;              \
@@ -143,7 +143,7 @@ template <typename Opr>
 void TimedProfiler<Opr>::preprocess(
         const TensorLayoutArray&, const megdnn::SmallVector<DeviceTensorND>&,
         UniqPtrWithCN<Opr>&, megdnn::Workspace&, std::array<TensorLayout, arity>&,
-        std::array<DeviceTensorND, arity_in>&, PreprocessFilter<Opr>&) {
+        std::array<megdnn::TensorND, arity_in>&, PreprocessFilter<Opr>&) {
     // Opr is neither conv-bias nor convolution; this function does nothing.
 }
@@ -154,7 +154,7 @@ void TimedProfiler<megdnn::ConvBias>::preprocess(
         const SmallVector<DeviceTensorND>& flt_val,
         UniqPtrWithCN<megdnn::ConvBias>& megdnn_opr, megdnn::Workspace& mdn_workspace,
         std::array<TensorLayout, arity>& layouts,
-        std::array<DeviceTensorND, arity_in>& inp_val,
+        std::array<megdnn::TensorND, arity_in>& inp_val,
         PreprocessFilter<megdnn::ConvBias>& prep_flt) {
     if (!preprocessed_layout.empty()) {
         auto&& pf = prep_flt;
@@ -164,8 +164,7 @@ void TimedProfiler<megdnn::ConvBias>::preprocess(
             pf.tensors[i] = flt_val[i].as_megdnn();
         }
         APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
-              std::forward_as_tuple(
-                      layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()),
+              std::forward_as_tuple(layouts[0], inp_val[1], inp_val[2]),
               array_skip<arity_in - 1>(layouts));
     }
 }
@@ -177,7 +176,7 @@ void TimedProfiler<megdnn::ConvolutionForward>::preprocess(
         const megdnn::SmallVector<DeviceTensorND>& flt_val,
         UniqPtrWithCN<megdnn::ConvolutionForward>& megdnn_opr,
         megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
-        std::array<DeviceTensorND, arity_in>& inp_val,
+        std::array<megdnn::TensorND, arity_in>& inp_val,
         PreprocessFilter<megdnn::ConvolutionForward>& prep_flt) {
     if (!preprocessed_layout.empty()) {
         auto&& pf = prep_flt;
@@ -187,8 +186,7 @@ void TimedProfiler<megdnn::ConvolutionForward>::preprocess(
             pf.tensors[i] = flt_val[i].as_megdnn();
         }
         APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
-              std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()),
-              array_skip<2>(layouts));
+              std::forward_as_tuple(layouts[0], inp_val[1]), array_skip<2>(layouts));
     }
 }
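
Note: both preprocess() specializations now take std::array<megdnn::TensorND,
arity_in> directly, matching prof_impl's new inp_val type, so the per-argument
.as_megdnn() conversions inside the APPLY calls disappear.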
@@ -259,8 +257,12 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
             std::max(cn.get_free_mem(), cn.get_max_block_size_available());
     auto align = cn.get_mem_addr_alignment();
     size_t tot_size = align;
-    for (int i = 0; i < arity; ++i) {
-        tot_size += layouts[i].span().high_byte + align;
+    for (size_t i = 0; i < arity; ++i) {
+        // if input tensornds are given, only count the output tensornds
+        if (param.inp_tensornds != nullptr) {
+            if (i >= (*param.inp_tensornds).size())
+                tot_size += layouts[i].span().high_byte + align;
+        }
     }
     for (const auto& layout : preprocessed_layout) {
         tot_size += layout.span().high_byte + align;
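
Note: when real inputs are supplied, their memory already exists on the
device, so the free-memory estimate above counts only the layouts beyond the
supplied inputs (i.e. the outputs). As written, the loop contributes nothing
when param.inp_tensornds is null; only the preprocessed layouts are counted
in that case.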
@@ -275,20 +277,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
 #endif

     // allocate input and output memory
-    std::array<DeviceTensorND, arity_in> inp_val;
-    std::array<DeviceTensorND, arity_out> out_val;
+    std::array<DeviceTensorND, arity_in> inp_dev;
+    std::array<DeviceTensorND, arity_out> out_dev;
+    std::array<megdnn::TensorND, arity_in> inp_val;
+    std::array<megdnn::TensorND, arity_out> out_val;
     DeviceTensorND workspace;
-    for (int i = 0; i < arity_in; ++i) {
-        inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]);
+
+    if (param.inp_tensornds != nullptr) {
+        // if inp_tensornds is given, reuse those tensors directly
+        for (int i = 0; i < arity_in; ++i) {
+            inp_val[i] = (*param.inp_tensornds)[i];
+        }
+    } else {
+        // otherwise create zero-filled tensors with the same layouts
+        for (int i = 0; i < arity_in; ++i) {
+            inp_dev[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]);
+            fill_zero_dev_tensor(inp_dev[i]);
+            inp_val[i] = inp_dev[i].as_megdnn();
+        }
     }
     for (int i = 0; i < arity_out; ++i) {
-        out_val[i]
+        out_dev[i]
                 .comp_node(cn)
                 .dtype(layouts[arity_in + i].dtype)
                 .resize(layouts[arity_in + i]);
+        out_val[i] = out_dev[i].as_megdnn();
     }
+
     megdnn::Workspace mdn_workspace;

     // allocate workspace
     if (param.workspace) {
         workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace});
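
Note: inp_val/out_val change from owning DeviceTensorNDs to plain
megdnn::TensorND views; the owning storage moves into inp_dev/out_dev and is
created (and zero-filled) only when no caller inputs are supplied. The
view/storage split in a nutshell (an illustrative fragment, with cn and ly
standing in for a CompNode and a TensorLayout):

    DeviceTensorND dev;                            // owns the device buffer
    dev.comp_node(cn).dtype(ly.dtype).resize(ly);  // allocate it
    megdnn::TensorND view = dev.as_megdnn();       // non-owning view for kernels

This is also why the exec/exec_preprocess call sites below drop .as_megdnn():
inp_val and out_val already are TensorNDs.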
@@ -304,10 +320,6 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
                     preprocessed_layout[i].format};
         }
     }
-    for (int i = 0; i < arity_in; ++i) {
-        fill_zero_dev_tensor(inp_val[i]);
-    }
-
     PreprocessFilter<Opr> prep_flt;
     preprocess(
             preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val,
@@ -322,13 +334,12 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
                 auto&& opr = _(megdnn_opr);
                 PreprocessFilter<Opr>* pf =
                         preprocessed_layout.empty() ? nullptr : &prep_flt;
-                APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val,
-                      out_val);
+                APPLY(opr->exec(args..., pf, mdn_workspace), inp_val, out_val);
             },
             /* else */
             [&](auto _) {
-                APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace),
-                      inp_val, out_val);
+                APPLY(_(megdnn_opr)->exec(args..., mdn_workspace), inp_val,
+                      out_val);
             });
     }
     ev_start->record();
@@ -337,13 +348,11 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
                 auto&& opr = _(megdnn_opr);
                 PreprocessFilter<Opr>* pf =
                         preprocessed_layout.empty() ? nullptr : &prep_flt;
-                APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val,
-                      out_val);
+                APPLY(opr->exec(args..., pf, mdn_workspace), inp_val, out_val);
             },
             /* else */
             [&](auto _) {
-                APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val,
-                      out_val);
+                APPLY(_(megdnn_opr)->exec(args..., mdn_workspace), inp_val, out_val);
             });
     ev_end->record();
@@ -370,10 +379,10 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
     DeviceTensorStorage storage;
     for (int i = 0; i < arity_in; ++i) {
-        inp_val[i].reset(storage, TensorLayout{});
+        inp_dev[i].reset(storage, TensorLayout{});
     }
     for (int i = 0; i < arity_out; ++i) {
-        out_val[i].reset(storage, TensorLayout{});
+        out_dev[i].reset(storage, TensorLayout{});
     }
     for (size_t i = 0; i < preprocessed_layout.size(); i++) {
         flt_val[i].reset(storage, TensorLayout{});
@@ -60,13 +60,15 @@ public:
     megdnn::param::ExecutionPolicy m_execution_policy;
     bool m_allow_weight_preprocess;
     const AlgoChooserDesc& m_desc;
+    SmallVector<megdnn::TensorND>* m_inputs;

 public:
     MGE_WIN_DECLSPEC_FUC AlgoChooserHelper(
             const FixedTensorLayouts& layouts, Opr* megdnn_opr,
             const std::string& param_str, const CompNode& cn,
             const megdnn::param::ExecutionPolicy& execution_policy,
-            bool allow_weight_preprocess, const AlgoChooserDesc& desc);
+            bool allow_weight_preprocess, const AlgoChooserDesc& desc,
+            SmallVector<megdnn::TensorND>* inputs = nullptr);

     Opr* megdnn_opr() const { return m_dnn_opr; }
@@ -93,6 +95,8 @@ public:
     const AlgoChooserDesc& desc() const { return m_desc; }

+    SmallVector<megdnn::TensorND>* get_input() const { return m_inputs; }
+
     //! construct algo chain by heuristic
     ImplExecutionPolicy choose_by_heuristic(
             const ExecutionStrategy& selected_strategy) const;
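
Note: get_input() exposes the stored pointer so that flatten_search_space()
and the recursive sub-helper constructions in the earlier hunks can forward
the same input tensors when profiling sub-operators.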
@@ -122,6 +122,8 @@ public:
     //! filled by profile()
     mutable double actual_timeout;

+    //! input tensornds used for profiling; may be null
+    SmallVector<megdnn::TensorND>* inp_tensornds;
 };

 struct Result {
@@ -141,7 +143,7 @@ private:
             const megdnn::TensorLayoutArray& preprocessed_layout,
             const SmallVector<DeviceTensorND>& flt_val, UniqPtrWithCN<Opr>& megdnn_opr,
             megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
-            std::array<DeviceTensorND, arity_in>& inp_val,
+            std::array<megdnn::TensorND, arity_in>& inp_val,
             PreprocessFilter<Opr>& prep_flt);

     static TResult prof_impl(const TParam& raw_param);
     static void prof_init_device(const TParam& raw_param);