diff --git a/dnn/include/megdnn/heuristic_cache.h b/dnn/include/megdnn/heuristic_cache.h
index 75f75c67..f8daf65a 100644
--- a/dnn/include/megdnn/heuristic_cache.h
+++ b/dnn/include/megdnn/heuristic_cache.h
@@ -26,7 +26,7 @@ private:
     HeuristicCache() = default;

 public:
-    static HeuristicCache& instance();
+    MGE_WIN_DECLSPEC_FUC static HeuristicCache& instance();

     struct KeyStorage {
         std::string category;
@@ -67,9 +67,9 @@ public:
         size_t workspace;
     };

-    void put(const Key& key, Result& result);
+    MGE_WIN_DECLSPEC_FUC void put(const Key& key, Result& result);

-    Result get(const Key& key);
+    MGE_WIN_DECLSPEC_FUC Result get(const Key& key);

     void clear();

diff --git a/imperative/src/impl/algo_chooser.h b/imperative/src/impl/algo_chooser.h
new file mode 100644
index 00000000..454c8723
--- /dev/null
+++ b/imperative/src/impl/algo_chooser.h
@@ -0,0 +1,54 @@
+#include "megbrain/rdnn/algo_chooser.h"
+#include "megdnn/heuristic_cache.h"
+
+namespace mgb {
+namespace imperative {
+
+template <typename Opr>
+MGE_WIN_DECLSPEC_FUC size_t setup_algo(
+        const typename mgb::rdnn::AlgoChooser<Opr>::FixedTensorLayouts& layouts,
+        Opr* megdnn_opr, uint32_t shared_batch_size, bool binary_equal_between_batch,
+        bool no_profiling_on_shape_change, CompNode comp_node,
+        megdnn::param::ExecutionPolicy execution_policy, bool allow_weight_preprocess) {
+    megdnn::HeuristicCache::Key cache_key(
+            megdnn_opr->handle(), megdnn_opr->get_opr_type(), layouts.data(),
+            layouts.size(), &megdnn_opr->param(), sizeof(megdnn_opr->param()));
+    auto rst = megdnn::HeuristicCache::instance().get(cache_key);
+    if (rst.policy.algo.valid()) {
+        megdnn_opr->execution_policy() = rst.policy;
+        return rst.workspace;
+    }
+
+    std::string param_str;
+    megdnn::Algorithm::serialize_write_pod(megdnn_opr->param(), param_str);
+
+    rdnn::AlgoChooserDesc desc;
+    desc.shared_batch_size = shared_batch_size;
+    desc.binary_equal_between_batch = binary_equal_between_batch;
+    desc.no_profiling_on_shape_change = no_profiling_on_shape_change;
+    desc.get_workspace_limit = [&](CompNode cn, size_t old_limit) {
+        size_t free = cn.get_free_mem();
+        size_t lmt = cn.get_max_block_size_available();
+        return std::max(lmt, free);
+    };
+
+    using AlgoChooserHelper = typename mgb::rdnn::AlgoChooser<Opr>::AlgoChooserHelper;
+    AlgoChooserHelper helper(
+            layouts, megdnn_opr, param_str, comp_node, execution_policy,
+            allow_weight_preprocess, desc);
+
+    megdnn::ExecutionPolicy policy;
+    policy = mgb::rdnn::AlgoChooser<Opr>::get_policy(helper);
+    size_t workspace = helper.get_workspace_size_bytes(policy, layouts);
+
+    megdnn_opr->execution_policy() = policy;
+
+    if (execution_policy.strategy & rdnn::ExecutionStrategy::HEURISTIC) {
+        megdnn::HeuristicCache::Result cache_result{policy, workspace};
+        megdnn::HeuristicCache::instance().put(cache_key, cache_result);
+    }
+    return workspace;
+}
+
+} // namespace imperative
+} // namespace mgb
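Note: setup_algo is the shared entry point the imperative convolution ops below use. It first consults megdnn::HeuristicCache and only on a miss runs the full rdnn::AlgoChooser selection, recording the chosen policy and workspace size when the strategy includes HEURISTIC. A minimal call sketch, illustrative only and not part of the patch; it assumes a megdnn::ConvBiasForward* whose param() is already filled in, plus prepared layouts and comp node, mirroring the use in convolution.cpp below:

    // First call with these layouts/params: cache miss, algo selection runs.
    // Subsequent identical calls: cache hit, selection is skipped entirely.
    size_t wk_size = setup_algo<megdnn::ConvBiasForward>(
            {src, filter, bias, z, dst},  // layout order is fixed by the opr type
            dnn_opr.get(), 0,             // shared_batch_size = 0
            false, false,                 // binary_equal_between_batch, no_profiling_on_shape_change
            cn, policy, false);           // comp node, execution policy, no weight preprocess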
diff --git a/imperative/src/impl/ops/convolution.cpp b/imperative/src/impl/ops/convolution.cpp
index 90f325d7..e4c19431 100644
--- a/imperative/src/impl/ops/convolution.cpp
+++ b/imperative/src/impl/ops/convolution.cpp
@@ -10,14 +10,23 @@
  */

 #include "megbrain/opr/dnn/convolution.h"
-#include "megbrain/imperative/ops/autogen.h"
-
+#include "../algo_chooser.h"
+#include "../blob_manager_impl.h"
+#include "../dnn_op_helper.h"
 #include "../op_trait.h"
+#include "megbrain/imperative/ops/autogen.h"
+#include "megbrain/opr/internal/megdnn_opr_wrapper.h"

 namespace mgb {
 namespace imperative {
 namespace {
+
+size_t infer_conv_shape(size_t inp, size_t flt, size_t stride, size_t pad) {
+    mgb_assert(inp + 2 * pad >= flt, "input=%zu padding=%zu filter=%zu", inp, pad, flt);
+    return (inp + 2 * pad - flt) / stride + 1;
+}
+
 namespace convolution {
 std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
     auto* node = &node_->cast_final_safe<opr::Convolution>();
@@ -31,14 +40,200 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
             inputs[0], inputs[1], conv.param(), conv.policy(), config);
 }

+TensorLayout do_shape_infer(
+        const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) {
+    auto&& conv = static_cast<const Convolution&>(def);
+    using Param = ::megdnn::param::Convolution;
+
+    auto img_ndim = src_ndim - 2;
+    mgb_assert(
+            img_ndim == 2,
+            "only 2D convolution is supported, and input should be 4-dim; "
+            "got input dim = %zu",
+            src_ndim);
+    size_t group = 1;
+    size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos;
+    if (conv.sparse == Param::Sparse::DENSE) {
+        mgb_assert(
+                filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4,
+                "bad filter ndim for dense convolution: "
+                "spatial_ndim=%zu filter_ndim=%zu",
+                img_ndim, filter.ndim);
+        group = 1;
+        flt_start = 0;
+    } else { // Param::Sparse::GROUP
+        mgb_assert(
+                filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5,
+                "bad filter ndim for group convolution: "
+                "spatial_ndim=%zu filter_ndim=%zu",
+                img_ndim, filter.ndim);
+        // grp, oc, ic, dims[]
+        group = filter[0];
+        flt_start = 1;
+    }
+
+    uint32_t ic_block_size = 1, oc_block_size = 1;
+    size_t src_or_dst_c_pos = 0;
+    size_t src_or_dst_spatial_start = 0;
+    if (conv.format == Param::Format::NCHW) {
+        // filter should be (oc, ic, fh, fw)
+        flt_spatial_start = 2;
+        ocpg_pos = 0;
+        icpg_pos = 1;
+        src_or_dst_c_pos = 1;
+        src_or_dst_spatial_start = 2;
+    } else { // Param::Format::NHWC
+        // filter should be (oc, fh, fw, ic)
+        flt_spatial_start = 1;
+        ocpg_pos = 0;
+        icpg_pos = 3;
+        src_or_dst_c_pos = 3;
+        src_or_dst_spatial_start = 1;
+    }
+    size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size;
+    size_t icpg = filter[flt_start + icpg_pos] * ic_block_size;
+    uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2];
+    dilation[0] = conv.dilate_h;
+    dilation[1] = conv.dilate_w;
+    stride[0] = conv.stride_h;
+    stride[1] = conv.stride_w;
+    padding[0] = conv.pad_h;
+    padding[1] = conv.pad_w;
+    for (size_t i = 0; i < img_ndim; ++i) {
+        mgb_assert(
+                dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i,
+                dilation[i]);
+        dilated_spatial[i] =
+                (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1;
+    }
+    mgb_assert(icpg * group == src[src_or_dst_c_pos], "group conv invalid");
+
+    TensorLayout dst{src.dtype};
+    dst.ndim = src_ndim;
+    dst[0] = src[0];
+    dst[src_or_dst_c_pos] = ocpg * group;
+    for (size_t i = 0; i < img_ndim; ++i) {
+        dst[i + src_or_dst_spatial_start] = infer_conv_shape(
+                src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i],
+                padding[i]);
+    }
+    dst.init_contiguous_stride();
+    return dst;
+}
+
+std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
+        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
+    auto&& conv = static_cast<const Convolution&>(def);
+
+    using Param = ::megdnn::param::Convolution;
+
+    SmallVector<LogicalTensorDesc> dests(1);
+    auto&& desc = dests[0];
+    desc.comp_node = inputs[0].comp_node;
+
+    TensorLayout src = inputs[0].layout;
+    size_t src_ndim = src.ndim;
+    if (src_ndim == 0) {
+        desc.layout = src;
+        return {dests, false};
+    }
+
+    TensorLayout filter = inputs[1].layout;
+    desc.layout = do_shape_infer(def, src_ndim, src, filter);
+    return {dests, true};
+}
+
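For concreteness, a worked example of the arithmetic above (values are hypothetical): a dense NCHW convolution with src = (1, 3, 224, 224) and filter = (64, 3, 7, 7), stride 2, pad 3, dilation 1 gives dilated_spatial = (7 - 1) * 1 + 1 = 7, and infer_conv_shape yields H_out = W_out = (224 + 2 * 3 - 7) / 2 + 1 = 112, so dst = (1, 64, 112, 112).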
+SmallVector<TensorPtr> apply_on_physical_tensor(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs,
+        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
+    // create megdnn opr
+    auto&& conv = static_cast<const Convolution&>(def);
+    CompNode cn = inputs[0]->comp_node();
+
+    TensorLayout out_layout = output_descs[0].layout;
+    if (!validated)
+        out_layout = do_shape_infer(
+                def, inputs[0]->layout().ndim, inputs[0]->layout(),
+                inputs[1]->layout());
+
+    DeviceTensorND out =
+            BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
+
+    using TensorND = megdnn::TensorND;
+    SmallVector<TensorND> inp_tensornds(inputs.size());
+    TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size());
+    for (unsigned i = 0; i < inputs.size(); ++i) {
+        inp_tensornds[i] = inputs[i]->dnn_tensor();
+        inp_shapes[i] = inputs[i]->layout();
+    }
+    oup_shapes[0] = out_layout;
+    auto&& dnn_opr = opr::intl::create_megdnn_opr<megdnn::ConvBiasForward>(cn);
+    dnn_opr->param().pad_h = conv.pad_h;
+    dnn_opr->param().pad_w = conv.pad_w;
+    dnn_opr->param().stride_h = conv.stride_h;
+    dnn_opr->param().stride_w = conv.stride_w;
+    dnn_opr->param().dilate_h = conv.dilate_h;
+    dnn_opr->param().dilate_w = conv.dilate_w;
+    dnn_opr->param().sparse = conv.sparse;
+    dnn_opr->param().compute_mode = conv.compute_mode;
+    dnn_opr->param().format = conv.format;
+
+    // shape infer
+    TensorLayout shp({0}, inputs[0]->dtype());
+    shp.ndim = 0;
+
+    size_t sz = setup_algo<megdnn::ConvBiasForward>(
+            {inp_shapes[0], inp_shapes[1], shp, shp, oup_shapes[0]}, dnn_opr.get(), 0,
+            false, false, cn, conv.policy(), false);
+
+    // alloc memory
+    DeviceTensorND bias = BlobManager::inst()->alloc_workspace_with_defrag(cn, shp);
+
+    auto wk = Blob::make(cn, sz);
+    auto ptr = wk->storage().get();
+    megdnn::Workspace dnn_wk(ptr, sz);
+
+    // execute
+    dnn_opr->exec(
+            inp_tensornds[0], inp_tensornds[1], bias.as_megdnn(), bias.as_megdnn(),
+            out.as_megdnn(), nullptr, dnn_wk);
+    return {Tensor::make(out)};
+}
+
 OP_TRAIT_REG(Convolution, Convolution, opr::Convolution)
         .make_from_op_node(make_from_op_node)
         .apply_on_var_node(apply_on_var_node)
+        .infer_output_attrs_fallible(infer_output_attrs_fallible)
+        .apply_on_physical_tensor(apply_on_physical_tensor)
         .fallback();
 } // namespace convolution
 } // namespace

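Note: forward Convolution is executed through a megdnn ConvBiasForward opr whose bias and z inputs are the empty (ndim = 0) placeholders built above (shp / bias). That is why five layouts are passed to setup_algo even though the Convolution op itself has only two inputs; presumably the point is to let the imperative path draw on ConvBias's algorithm set. The conv_bias registration that follows is moved here verbatim from specializations.cpp (see the removal at the end of this patch).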
 namespace {
+namespace conv_bias {
+auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
+    auto&& conv = static_cast<const ConvBias&>(def);
+    cg::OperatorNodeConfig config{conv.dtype};
+    config.name(conv.make_name());
+    if (inputs.size() == 2) {
+        return opr::ConvBias::make(
+                inputs[0], inputs[1], conv.param(), conv.policy(), config);
+    } else if (inputs.size() == 3) {
+        return opr::ConvBias::make(
+                inputs[0], inputs[1], inputs[2], conv.param(), conv.policy(), config);
+    } else if (inputs.size() == 4) {
+        return opr::ConvBias::make(
+                inputs[0], inputs[1], inputs[2], inputs[3], conv.param(), conv.policy(),
+                config);
+    }
+    mgb_assert(0);
+}
+
+OP_TRAIT_REG(ConvBias, ConvBias).apply_on_var_node(apply_on_var_node).fallback();
+} // namespace conv_bias
+} // namespace
+
+namespace {
 namespace convolution_backward_data {
 auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
     auto&& conv = static_cast<const ConvolutionBackwardData&>(def);
@@ -76,9 +271,159 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
     return opr::Convolution3D::make(inputs[0], inputs[1], conv.param(), conv.policy());
 }

+TensorLayout do_shape_infer(
+        const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) {
+    auto&& conv = static_cast<const Convolution3D&>(def);
+    using Param = ::megdnn::param::Convolution3D;
+    auto img_ndim = src_ndim - 2;
+    mgb_assert(
+            img_ndim == 3,
+            "only 3D convolution is supported, and input should be 5-dim; "
+            "got input dim = %zu",
+            src_ndim);
+
+    size_t group = 1;
+    size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos;
+    if (conv.sparse == Param::Sparse::DENSE) {
+        mgb_assert(
+                filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4,
+                "bad filter ndim for dense convolution: "
+                "spatial_ndim=%zu filter_ndim=%zu",
+                img_ndim, filter.ndim);
+        group = 1;
+        flt_start = 0;
+    } else { // Param::Sparse::GROUP
+        mgb_assert(
+                filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5,
+                "bad filter ndim for group convolution: "
+                "spatial_ndim=%zu filter_ndim=%zu",
+                img_ndim, filter.ndim);
+
+        // grp, oc, ic, dims[]
+        group = filter[0];
+        flt_start = 1;
+    }
+
+    uint32_t ic_block_size = 1, oc_block_size = 1;
+    size_t src_or_dst_c_pos = 0;
+    size_t src_or_dst_spatial_start = 0;
+    if (conv.format == Param::Format::NCDHW) {
+        // filter should be (oc, ic, fd, fh, fw)
+        flt_spatial_start = 2;
+        ocpg_pos = 0;
+        icpg_pos = 1;
+        src_or_dst_c_pos = 1;
+        src_or_dst_spatial_start = 2;
+    } else { // Param::Format::NDHWC
+        // filter should be (oc, fd, fh, fw, ic)
+        flt_spatial_start = 1;
+        ocpg_pos = 0;
+        icpg_pos = 4;
+        src_or_dst_c_pos = 4;
+        src_or_dst_spatial_start = 1;
+    }
+    size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size;
+    size_t icpg = filter[flt_start + icpg_pos] * ic_block_size;
+    uint32_t dilation[3], dilated_spatial[3], stride[3], padding[3];
+    dilation[0] = conv.dilate_d;
+    dilation[1] = conv.dilate_h;
+    dilation[2] = conv.dilate_w;
+    stride[0] = conv.stride_d;
+    stride[1] = conv.stride_h;
+    stride[2] = conv.stride_w;
+    padding[0] = conv.pad_d;
+    padding[1] = conv.pad_h;
+    padding[2] = conv.pad_w;
+    for (size_t i = 0; i < img_ndim; ++i) {
+        mgb_assert(
+                dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i,
+                dilation[i]);
+        dilated_spatial[i] =
+                (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1;
+    }
+    mgb_assert(icpg * group == src[src_or_dst_c_pos], "group conv invalid");
+
+    TensorLayout dst{src.dtype};
+    dst.ndim = src_ndim;
+    dst[0] = src[0];
+    dst[src_or_dst_c_pos] = ocpg * group;
+    for (size_t i = 0; i < img_ndim; ++i) {
+        dst[i + src_or_dst_spatial_start] = infer_conv_shape(
+                src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i],
+                padding[i]);
+    }
+    dst.init_contiguous_stride();
+
+    return dst;
+}
+
+std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
+        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
+    auto&& conv = static_cast<const Convolution3D&>(def);
+    using Param = ::megdnn::param::Convolution3D;
+
+    SmallVector<LogicalTensorDesc> dests(1);
+    auto&& desc = dests[0];
+    desc.comp_node = inputs[0].comp_node;
+
+    TensorLayout src = inputs[0].layout;
+    size_t src_ndim = src.ndim;
+    if (src_ndim == 0) {
+        return {dests, false};
+    }
+
+    TensorLayout filter = inputs[1].layout;
+    desc.layout = do_shape_infer(def, src_ndim, src, filter);
+    return {dests, true};
+}
+
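As in the 2D case, a quick check of the arithmetic (hypothetical values): a dense NCDHW convolution with src = (1, 4, 16, 64, 64) and filter = (8, 4, 3, 3, 3), stride 1, pad 1, dilation 1 gives dilated_spatial = 3 on every axis, so D_out = (16 + 2 * 1 - 3) / 1 + 1 = 16 and H_out = W_out = (64 + 2 * 1 - 3) / 1 + 1 = 64, i.e. dst = (1, 8, 16, 64, 64).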
+SmallVector<TensorPtr> apply_on_physical_tensor(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs,
+        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
+    // create megdnn opr
+    auto&& conv = static_cast<const Convolution3D&>(def);
+
+    TensorLayout out_layout = output_descs[0].layout;
+    if (!validated)
+        out_layout = do_shape_infer(
+                def, inputs[0]->layout().ndim, inputs[0]->layout(),
+                inputs[1]->layout());
+
+    using TensorND = megdnn::TensorND;
+    CompNode cn = inputs[0]->comp_node();
+    SmallVector<TensorND> inp_tensornds(inputs.size());
+    TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size());
+    for (unsigned i = 0; i < inputs.size(); ++i) {
+        inp_tensornds[i] = inputs[i]->dnn_tensor();
+        inp_shapes[i] = inputs[i]->layout();
+    }
+    oup_shapes[0] = out_layout;
+    auto&& dnn_opr = opr::intl::create_megdnn_opr<megdnn::Convolution3DForward>(cn);
+    dnn_opr->param() = conv.param();
+
+    // shape infer
+    size_t sz = setup_algo<megdnn::Convolution3DForward>(
+            {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.get(), 0, false,
+            false, cn, conv.policy(), false);
+
+    // alloc memory
+    DeviceTensorND out =
+            BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
+
+    auto wk = Blob::make(cn, sz);
+    auto ptr = wk->storage().get();
+    megdnn::Workspace dnn_wk(ptr, sz);
+
+    // execute
+    dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], out.as_megdnn(), dnn_wk);
+    return {Tensor::make(out)};
+}
+
 OP_TRAIT_REG(Convolution3D, Convolution3D, opr::Convolution3D)
         .make_from_op_node(make_from_op_node)
         .apply_on_var_node(apply_on_var_node)
+        .infer_output_attrs_fallible(infer_output_attrs_fallible)
+        .apply_on_physical_tensor(apply_on_physical_tensor)
         .fallback();
 } // namespace convolution3d
 } // namespace
diff --git a/imperative/src/impl/ops/specializations.cpp b/imperative/src/impl/ops/specializations.cpp
index dae2eee1..681ad132 100644
--- a/imperative/src/impl/ops/specializations.cpp
+++ b/imperative/src/impl/ops/specializations.cpp
@@ -224,30 +224,6 @@ OP_TRAIT_REG(AdaptivePooling, AdaptivePooling)
 } // namespace

 namespace {
-namespace conv_bias {
-auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
-    auto&& conv = static_cast<const ConvBias&>(def);
-    cg::OperatorNodeConfig config{conv.dtype};
-    config.name(conv.make_name());
-    if (inputs.size() == 2) {
-        return opr::ConvBias::make(
-                inputs[0], inputs[1], conv.param(), conv.policy(), config);
-    } else if (inputs.size() == 3) {
-        return opr::ConvBias::make(
-                inputs[0], inputs[1], inputs[2], conv.param(), conv.policy(), config);
-    } else if (inputs.size() == 4) {
-        return opr::ConvBias::make(
-                inputs[0], inputs[1], inputs[2], inputs[3], conv.param(), conv.policy(),
-                config);
-    }
-    mgb_assert(0);
-}
-
-OP_TRAIT_REG(ConvBias, ConvBias).apply_on_var_node(apply_on_var_node).fallback();
-} // namespace conv_bias
-} // namespace
-
-namespace {
 namespace batch_conv_bias {
 auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
     auto&& conv = static_cast<const BatchConvBias&>(def);