diff --git a/src/gopt/impl/framework.cpp b/src/gopt/impl/framework.cpp
index 9b5784a5..7a216877 100644
--- a/src/gopt/impl/framework.cpp
+++ b/src/gopt/impl/framework.cpp
@@ -743,20 +743,28 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
     });
     cb(nhwcd4, {
         add_pass<FuseConvBiasNonlinPass>();
+        add_pass(PaddingChannelPass::make(
+                cg::GraphCommonOptimizeOptions::LayoutTransform::NHWCD4, true));
         add_pass(ConvertFormatPass::make_nhwcd4_converter());
     });
     cb(nchw88, {
         add_pass<FuseConvBiasNonlinPass>();
+        add_pass(PaddingChannelPass::make(
+                cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW88, true));
         add_pass(EnableNchwxxPass::make_nchwxx_converter(8));
         add_pass<ShuffleShuffleRemovePass>();
     });
     cb(nchw44, {
         add_pass<FuseConvBiasNonlinPass>();
+        add_pass(PaddingChannelPass::make(
+                cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW44, true));
         add_pass(EnableNchwxxPass::make_nchwxx_converter(4));
         add_pass<ShuffleShuffleRemovePass>();
     });
     cb(nchw44_dot, {
         add_pass<FuseConvBiasNonlinPass>();
+        add_pass(PaddingChannelPass::make(
+                cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW44_DOT, true));
         add_pass(EnableNchw44DotPass::make_nchw44_dot_converter());
         add_pass<ShuffleShuffleRemovePass>();
     });
@@ -784,7 +792,7 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
     cb(nchw64, {
         add_pass<FuseConvBiasNonlinPass>();
         add_pass(PaddingChannelPass::make(
-                cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW64));
+                cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW64, false));
         add_pass<FuseConvBiasZPass>();
         add_pass(EnableNCHW64Pass::make_nchw64_converter());
         add_pass<ShuffleShuffleRemovePass>();
diff --git a/src/gopt/impl/padding_channel.cpp b/src/gopt/impl/padding_channel.cpp
index d4089a90..8261ff73 100644
--- a/src/gopt/impl/padding_channel.cpp
+++ b/src/gopt/impl/padding_channel.cpp
@@ -1,5 +1,6 @@
 #include "megbrain/gopt/inference.h"
 #include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/dnn/adaptive_pooling.h"
 #include "megbrain/opr/dnn/convolution.h"
 #include "megbrain/opr/dnn/pooling.h"
 #include "megbrain/opr/imgproc.h"
@@ -34,8 +35,8 @@ using ReformatKey = ReformatManager::ReformatKey;
 /* ==================== PaddingChannelPass ================= */
 namespace {
-size_t padding_int4(size_t in_channel, bool flag) {
-    static_cast<void>(flag);
+
+size_t padding_int4(size_t in_channel, bool) {
     if (in_channel <= 32) {
         return (8 - (in_channel % 8)) % 8;
     } else {
@@ -43,6 +44,8 @@ size_t padding_int4(size_t in_channel, bool flag) {
     }
 }
 
+//! the flag is used by the caller to distinguish cases; e.g. in nchw64 it is
+//! used to distinguish conv-bias from convolution backward data
 size_t padding_int8(size_t in_channel, bool flag) {
     if (flag) {
         if (in_channel <= 16) {
@@ -58,24 +61,41 @@ size_t padding_4(size_t in_channel, bool) {
     return (4 - (in_channel % 4)) % 4;
 };
 
+size_t padding_8(size_t in_channel, bool) {
+    return (8 - (in_channel % 8)) % 8;
+};
+
 }  // namespace
 
 std::unique_ptr<PaddingChannelPass> PaddingChannelPass::make(
-        cg::GraphCommonOptimizeOptions::LayoutTransform layout_transform) {
+        cg::GraphCommonOptimizeOptions::LayoutTransform layout_transform,
+        bool only_padding_weights) {
     MIDOUT_B("PaddingChannelPass::make")
     using LayoutTrans = cg::GraphCommonOptimizeOptions::LayoutTransform;
-    auto ret = std::make_unique<PaddingChannelPass>();
+    auto ret = std::unique_ptr<PaddingChannelPass>(
+            new PaddingChannelPass(only_padding_weights));
    auto& alignment_map = ret->m_alignment_map;
     if (layout_transform == LayoutTrans::NCHW64) {
         alignment_map[DTypeEnum::QuantizedS4] = padding_int4;
         alignment_map[DTypeEnum::Quantized4Asymm] = padding_int4;
         alignment_map[DTypeEnum::QuantizedS8] = padding_int8;
     } else if (
+            layout_transform == LayoutTrans::NHWCD4 ||
             layout_transform == LayoutTrans::NCHW44 ||
             layout_transform == LayoutTrans::NCHW44_DOT) {
         alignment_map[DTypeEnum::QuantizedS8] = padding_4;
         alignment_map[DTypeEnum::Quantized8Asymm] = padding_4;
         alignment_map[DTypeEnum::Float32] = padding_4;
+#if !MEGDNN_DISABLE_FLOAT16
+        alignment_map[DTypeEnum::Float16] = padding_4;
+#endif
+    } else if (layout_transform == LayoutTrans::NCHW88) {
+        alignment_map[DTypeEnum::QuantizedS8] = padding_8;
+        alignment_map[DTypeEnum::Quantized8Asymm] = padding_8;
+        alignment_map[DTypeEnum::Float32] = padding_8;
+#if !MEGDNN_DISABLE_FLOAT16
+        alignment_map[DTypeEnum::Float16] = padding_8;
+#endif
     }
     ret->fill_opr_convert_fun(layout_transform);
     return ret;
@@ -138,6 +158,10 @@ VarNode* PaddingChannelPass::extract_subtensor(
     mgb_assert(inp->shape()[2] == orig_shape[2]);
     mgb_assert(inp->shape()[3] == orig_shape[3]);
     size_t orig_channels = orig_shape[1];
+    //! if the channel is not padded, do nothing
+    if (orig_channels == inp->shape()[1]) {
+        return inp;
+    }
     auto x = SymbolVar(inp);
     auto cv = [&x](int v) { return x.make_scalar(v); };
     using AIdx = opr::Subtensor::AxisIndexer;
@@ -150,8 +174,25 @@ VarNode* PaddingChannelPass::extract_subtensor(
 };
 
 VarNode* PaddingChannelPass::pad_in_channels(VarNode* inp, size_t pad_channels) {
-    mgb_assert(inp->shape().ndim == 4);
-    TensorShape shape{inp->shape()[0], pad_channels, inp->shape()[2], inp->shape()[3]};
+    TensorShape shape;
+    size_t axis = 0;
+    if (inp->shape().ndim == 4) {
+        shape = TensorShape{
+                inp->shape()[0], pad_channels, inp->shape()[2], inp->shape()[3]};
+        axis = 1;
+    } else {
+        mgb_assert(inp->shape().ndim == 5);
+        //! the channel wise convolution
+        if (inp->shape()[1] == 1 && inp->shape()[2] == 1) {
+            shape = TensorShape{
+                    pad_channels, inp->shape()[1], inp->shape()[2], inp->shape()[3],
+                    inp->shape()[4]};
+            axis = 0;
+        } else {
the group convolution + mgb_assert(0, "group convolution can't padding cahnnel\n"); + } + } std::shared_ptr host_val = std::make_shared(inp->comp_node(), inp->dtype()); host_val->resize(shape); @@ -159,13 +200,30 @@ VarNode* PaddingChannelPass::pad_in_channels(VarNode* inp, size_t pad_channels) size_t size_bytes = TensorLayout{shape, inp->dtype()}.span().dist_byte(); std::memset(ptr, 0, size_bytes); auto padding = opr::ImmutableTensor::make(*inp->owner_graph(), *host_val); - auto out = opr::Concat::make({inp, padding}, 1); + auto out = opr::Concat::make({inp, padding}, axis); return out.node(); }; VarNode* PaddingChannelPass::pad_out_channels(VarNode* inp, size_t pad_channels) { - mgb_assert(inp->shape().ndim == 4); - TensorShape shape{pad_channels, inp->shape()[1], inp->shape()[2], inp->shape()[3]}; + TensorShape shape; + size_t axis = 0; + if (inp->shape().ndim == 4) { + shape = TensorShape{ + pad_channels, inp->shape()[1], inp->shape()[2], inp->shape()[3]}; + axis = 0; + } else { + mgb_assert(inp->shape().ndim == 5); + //! the channel wise convolution + if (inp->shape()[1] == 1 && inp->shape()[2] == 1) { + shape = TensorShape{ + pad_channels, inp->shape()[1], inp->shape()[2], inp->shape()[3], + inp->shape()[4]}; + axis = 0; + } else { + //! the group convolution + mgb_assert(0, "group convolution can't padding cahnnel\n"); + } + } std::shared_ptr host_val = std::make_shared(inp->comp_node(), inp->dtype()); host_val->resize(shape); @@ -173,15 +231,15 @@ VarNode* PaddingChannelPass::pad_out_channels(VarNode* inp, size_t pad_channels) size_t size_bytes = TensorLayout{shape, inp->dtype()}.span().dist_byte(); std::memset(ptr, 0, size_bytes); auto padding = opr::ImmutableTensor::make(*inp->owner_graph(), *host_val); - auto out = opr::Concat::make({inp, padding}, 0); + auto out = opr::Concat::make({inp, padding}, axis); return out.node(); }; -// padding policy for conv bias with data type qint8 -OperatorNodeBase* PaddingChannelPass::padding_policy( +// padding policy for dense convolution +OperatorNodeBase* PaddingChannelPass::padding_conv_policy( OperatorNodeBase* opr, const VarNodeArray& new_inp) { mgb_assert(opr->input().size() == new_inp.size()); - mgb_assert(new_inp.size() == 3); + mgb_assert(new_inp.size() >= 2); //! new weights and old weights are same shape mgb_assert(opr->input(1)->shape().eq_shape(new_inp[1]->shape())); auto inps = new_inp; @@ -198,7 +256,8 @@ OperatorNodeBase* PaddingChannelPass::padding_policy( if (m_padding_oprs.count(opr->input(0)->owner_opr())) { //! as the opr of input var is padding, but the dtype of input and output of //! the input opr maybe different, so the alignment is not the same - size_t pad_channels_0 = it->second(new_in_channels, true); + size_t pad_channels_0 = + m_only_padding_weights ? 
+                m_only_padding_weights ? 0 : it->second(new_in_channels, true);
         size_t pad_channels_1 = it->second(in_channels, true);
         if (pad_channels_0) {
             inps[0] = pad_in_channels(new_inp[0], pad_channels_0);
@@ -211,7 +270,7 @@ OperatorNodeBase* PaddingChannelPass::padding_policy(
     } else {
         mgb_assert(new_in_channels == in_channels);
         size_t pad_channels = it->second(in_channels, true);
-        if (pad_channels > 0) {
+        if (pad_channels > 0 && !m_only_padding_weights) {
             inps[0] = pad_in_channels(new_inp[0], pad_channels);
             inps[1] = pad_in_channels(new_inp[1], pad_channels);
         }
@@ -220,31 +279,63 @@ OperatorNodeBase* PaddingChannelPass::padding_policy(
     size_t pad_channels = it->second(out_channels, true);
     if (pad_channels > 0) {
         inps[1] = pad_out_channels(inps[1], pad_channels);
-        inps[2] = pad_in_channels(inps[2], pad_channels);
+        if (inps.size() >= 3) {
+            inps[2] = pad_in_channels(inps[2], pad_channels);
+        }
         m_padding_oprs.insert(opr);
     }
     return serialization::copy_opr_shallow(*opr, inps, opr->config());
 };
 
+//! padding policy for channel wise convolution
+OperatorNodeBase* PaddingChannelPass::padding_channel_wise_conv_policy(
+        OperatorNodeBase* opr, const VarNodeArray& new_inp) {
+    mgb_assert(opr->input().size() == new_inp.size());
+    mgb_assert(opr->input()[1]->shape().ndim == 5);
+    mgb_assert(new_inp.size() >= 2);
+    //! new weights and old weights have the same shape
+    mgb_assert(opr->input(1)->shape().eq_shape(new_inp[1]->shape()));
+    auto inps = new_inp;
+    size_t group = opr->input(1)->shape()[0];
+    size_t new_in_channels = new_inp[0]->shape()[1];
+    auto it = m_alignment_map.find(opr->input(0)->dtype().enumv());
+    if (it != m_alignment_map.end()) {
+        mgb_assert(it->second);
+    } else {
+        return serialization::copy_opr_shallow(*opr, inps, opr->config());
+    }
+    // pad input channels
+    if (m_padding_oprs.count(opr->input(0)->owner_opr())) {
+        size_t pad_channels_1 = new_in_channels - group;
+        if (pad_channels_1) {
+            inps[1] = pad_in_channels(new_inp[1], pad_channels_1);
+            m_padding_oprs.insert(opr);
+        }
+    }
+    return serialization::copy_opr_shallow(*opr, inps, opr->config());
+};
+
 void PaddingChannelPass::fill_opr_convert_fun(LayoutTrans layout_trans) {
-    add_convbias_replace_func(layout_trans);
+    add_conv_replace_func(layout_trans);
     add_conv_backward_data_replace_func(layout_trans);
     add_format_aware_opr_replace_func(layout_trans);
     add_elemwise_like_opr_replace_func(layout_trans);
+    add_condition_padding_oprs_replace_func(layout_trans);
     add_nonpadding_oprs_replace_func(layout_trans);
 }
 
-void PaddingChannelPass::add_convbias_replace_func(LayoutTrans layout_trans) {
+void PaddingChannelPass::add_conv_replace_func(LayoutTrans layout_trans) {
     if (layout_trans == LayoutTrans::NCHW64) {
         m_opr_replace_funcs[opr::ConvBiasForward::typeinfo()] =
                 [this](OperatorNodeBase* opr, const VarNodeArray& new_inp) {
-                    if (opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8) {
-                        return padding_policy(opr, new_inp);
-                    } else if (
-                            opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4 ||
-                            opr->input(0)->dtype().enumv() ==
-                                    DTypeEnum::Quantized4Asymm) {
-                        return padding_policy(opr, new_inp);
+                    mgb_assert(
+                            opr->input()[1]->shape().ndim == 4,
+                            "nchw64 format only supports padding channel in dense "
+                            "convolution\n");
+                    if (opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8 ||
+                        opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4 ||
+                        opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm) {
+                        return padding_conv_policy(opr, new_inp);
                     } else {
                         mgb_assert(
                                 m_padding_oprs.count(opr->input(0)->owner_opr()) == 0,
@@ -257,11 +348,36 @@ void PaddingChannelPass::add_convbias_replace_func(LayoutTrans layout_trans) {
                                 *opr, new_inp, opr->config());
                     }
                 };
-    } else if (layout_trans == LayoutTrans::NCHW44) {
-        m_opr_replace_funcs[opr::ConvBiasForward::typeinfo()] =
-                [this](OperatorNodeBase* opr, const VarNodeArray& new_inp) {
-                    return padding_policy(opr, new_inp);
-                };
+    } else if (
+            layout_trans == LayoutTrans::NCHW44 ||
+            layout_trans == LayoutTrans::NCHW44_DOT ||
+            layout_trans == LayoutTrans::NCHW88) {
+        auto padding_conv = [this](OperatorNodeBase* opr, const VarNodeArray& new_inp) {
+            if (opr->input()[1]->shape().ndim == 4) {
+                return padding_conv_policy(opr, new_inp);
+            } else {
+                mgb_assert(opr->input()[1]->shape().ndim == 5);
+                if (opr->input()[1]->shape()[1] == 1 &&
+                    opr->input()[1]->shape()[2] == 1) {
+                    return padding_channel_wise_conv_policy(opr, new_inp);
+                } else {
+                    //! group convolution can't pad channels
+                    mgb_assert(opr->input().size() == new_inp.size());
+                    auto inps = new_inp;
+                    for (size_t i = 0; i < new_inp.size(); ++i) {
+                        auto cur_inp = opr->input(i);
+                        bool padding_cur_inp =
+                                m_padding_oprs.count(cur_inp->owner_opr()) > 0;
+                        if (padding_cur_inp) {
+                            inps[i] = extract_subtensor(inps[i], cur_inp->shape());
+                        }
+                    }
+                    return serialization::copy_opr_shallow(*opr, inps, opr->config());
+                }
+            }
+        };
+        m_opr_replace_funcs[opr::ConvBiasForward::typeinfo()] = padding_conv;
+        m_opr_replace_funcs[opr::Convolution::typeinfo()] = padding_conv;
     }
 }
 
@@ -298,7 +414,9 @@ void PaddingChannelPass::add_conv_backward_data_replace_func(LayoutTrans layout_
             size_t pad_channels = new_out_channels - out_channels;
             inps[0] = pad_out_channels(new_inp[0], pad_channels);
         } else {
-            size_t pad_channels = it->second(out_channels, false);
+            size_t pad_channels = m_only_padding_weights
+                                        ? 0
+                                        : it->second(out_channels, false);
             if (pad_channels > 0) {
                 inps[0] = pad_out_channels(new_inp[0], pad_channels);
                 inps[1] = pad_in_channels(new_inp[1], pad_channels);
@@ -313,24 +431,43 @@ void PaddingChannelPass::add_conv_backward_data_replace_func(LayoutTrans layout_
             }
             return serialization::copy_opr_shallow(*opr, inps, opr->config());
         };
+    } else {
+        m_opr_replace_funcs[opr::ConvolutionBackwardData::typeinfo()] =
+                [this](OperatorNodeBase* opr, const VarNodeArray& new_inp) {
+                    mgb_assert(opr->input(0)->shape().eq_shape(new_inp[0]->shape()));
+                    auto inps = new_inp;
+                    size_t out_channels = opr->input(0)->shape()[0];
+                    size_t new_out_channels = new_inp[1]->shape()[1];
+                    // pad output channels
+                    if (m_padding_oprs.count(opr->input(1)->owner_opr())) {
+                        size_t pad_channels = new_out_channels - out_channels;
+                        inps[0] = pad_out_channels(new_inp[0], pad_channels);
+                    }
+                    out_channels = inps[0]->shape()[0];
+
+                    return serialization::copy_opr_shallow(*opr, inps, opr->config());
+                };
     }
 }
 
-void PaddingChannelPass::add_format_aware_opr_replace_func(LayoutTrans) {
-    auto replace_format_aware_opr = [this](OperatorNodeBase* opr,
-                                           const VarNodeArray& new_inp) {
-        if (opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS8 &&
-            opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS4 &&
-            opr->input(0)->dtype().enumv() != DTypeEnum::Quantized4Asymm) {
-            mgb_assert(
-                    m_padding_oprs.count(opr->input(0)->owner_opr()) == 0,
-                    "operator(type:%s,name:%s) for data type(%s) cannot be "
-                    "padded channel. extra info:"
-                    "consumer(%s), producer(%s)",
-                    opr->dyn_typeinfo()->name, opr->cname(),
-                    opr->input(0)->dtype().name(), opr->cname(),
-                    opr->input(0)->owner_opr()->cname());
-            return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
+void PaddingChannelPass::add_format_aware_opr_replace_func(LayoutTrans layout_trans) {
+    auto replace_format_aware_opr = [this, layout_trans](
+                                            OperatorNodeBase* opr,
+                                            const VarNodeArray& new_inp) {
+        if (layout_trans == LayoutTrans::NCHW64) {
+            if (opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS8 &&
+                opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS4 &&
+                opr->input(0)->dtype().enumv() != DTypeEnum::Quantized4Asymm) {
+                mgb_assert(
+                        m_padding_oprs.count(opr->input(0)->owner_opr()) == 0,
+                        "operator(type:%s,name:%s) for data type(%s) cannot be "
+                        "padded channel. extra info:"
+                        "consumer(%s), producer(%s)",
+                        opr->dyn_typeinfo()->name, opr->cname(),
+                        opr->input(0)->dtype().name(), opr->cname(),
+                        opr->input(0)->owner_opr()->cname());
+                return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
+            }
         }
         mgb_assert(opr->input().size() == new_inp.size());
         if (m_padding_oprs.count(opr->input(0)->owner_opr())) {
@@ -341,6 +478,9 @@ void PaddingChannelPass::add_format_aware_opr_replace_func(LayoutTrans) {
     m_opr_replace_funcs[opr::PoolingForward::typeinfo()] = replace_format_aware_opr;
     m_opr_replace_funcs[opr::WarpPerspectiveForward::typeinfo()] =
             replace_format_aware_opr;
+    m_opr_replace_funcs[opr::WarpAffine::typeinfo()] = replace_format_aware_opr;
+    m_opr_replace_funcs[opr::AdaptivePooling::typeinfo()] = replace_format_aware_opr;
+    m_opr_replace_funcs[opr::ResizeForward::typeinfo()] = replace_format_aware_opr;
 }
 
 void PaddingChannelPass::add_elemwise_like_opr_replace_func(LayoutTrans) {
@@ -353,6 +493,10 @@ void PaddingChannelPass::add_elemwise_like_opr_replace_func(LayoutTrans) {
         size_t channels_after_padding = 0;
         size_t i = 0;
         for (auto&& cur_inp : opr->input()) {
+            if (cur_inp->shape().is_scalar()) {
+                ++i;
+                continue;
+            }
             bool padding_cur_inp = m_padding_oprs.count(cur_inp->owner_opr()) > 0;
             if (padding_cur_inp) {
                 if (!have_padding_inp)
@@ -363,8 +507,9 @@ void PaddingChannelPass::add_elemwise_like_opr_replace_func(LayoutTrans) {
                     same_padding = channels_after_padding == new_inp[i]->shape()[1];
                 }
             }
-            if (padding_all_inps && (!padding_cur_inp || !same_padding))
+            if (padding_all_inps && (!padding_cur_inp || !same_padding)) {
                 padding_all_inps = false;
+            }
             ++i;
         }
         if (have_padding_inp && !padding_all_inps) {
@@ -378,7 +523,7 @@ void PaddingChannelPass::add_elemwise_like_opr_replace_func(LayoutTrans) {
             }
             return serialization::copy_opr_shallow(*opr, inps, opr->config());
         }
-        if (padding_all_inps) {
+        if (padding_all_inps && have_padding_inp) {
             m_padding_oprs.insert(opr);
         }
         return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
@@ -386,6 +531,53 @@ void PaddingChannelPass::add_elemwise_like_opr_replace_func(LayoutTrans) {
     m_opr_replace_funcs[opr::ElemwiseMultiType::typeinfo()] = replace_elemwise_like_opr;
     m_opr_replace_funcs[opr::Elemwise::typeinfo()] = replace_elemwise_like_opr;
     m_opr_replace_funcs[opr::TypeCvt::typeinfo()] = replace_elemwise_like_opr;
+    m_opr_replace_funcs[opr::PowC::typeinfo()] = replace_elemwise_like_opr;
+}
+
+void PaddingChannelPass::add_condition_padding_oprs_replace_func(LayoutTrans) {
+    auto replace_condition_oprs = [this](OperatorNodeBase* opr,
+                                         const VarNodeArray& new_inp) {
+        mgb_assert(opr->input().size() == new_inp.size());
+        bool can_forward_padding = true;
+        if (auto reduce = opr->try_cast_final<opr::Reduce>()) {
+            auto axis = reduce->param().axis;
+            if (axis < 0) {
+                axis += reduce->input(0)->layout().ndim;
+            }
+            //! don't forward padding when reducing along the channel dim
+            if (reduce->input().size() > 1) {
+                can_forward_padding = false;
+            } else {
+                can_forward_padding = axis != 1;
+            }
+        } else if (auto subtensor = opr->try_cast_final<opr::Subtensor>()) {
+            auto indexs = subtensor->index_desc();
+            size_t input_dim = subtensor->input(0)->shape().ndim;
+            for (size_t id = 0; id < indexs.size(); id++) {
+                if (indexs[id].axis.get(input_dim) == 1) {
+                    //! when the Subtensor works on the channel dim, the padding
+                    //! can be forwarded without adding a Subtensor only in idx
+                    //! mode or when the end is explicitly given
+                    can_forward_padding &=
+                            indexs[id].idx.node() || indexs[id].end.node();
+                }
+            }
+        }
+        auto inps = new_inp;
+        for (size_t i = 0; i < new_inp.size(); ++i) {
+            auto cur_inp = opr->input(i);
+            bool padding_cur_inp = m_padding_oprs.count(cur_inp->owner_opr()) > 0;
+            if (padding_cur_inp) {
+                if (can_forward_padding) {
+                    m_padding_oprs.insert(opr);
+                } else {
+                    inps[i] = extract_subtensor(inps[i], cur_inp->shape());
+                }
+            }
+        }
+        return serialization::copy_opr_shallow(*opr, inps, opr->config());
+    };
+    m_opr_replace_funcs[opr::Reduce::typeinfo()] = replace_condition_oprs;
+    m_opr_replace_funcs[opr::Subtensor::typeinfo()] = replace_condition_oprs;
 }
 
 void PaddingChannelPass::add_nonpadding_oprs_replace_func(LayoutTrans) {
@@ -405,8 +597,11 @@ void PaddingChannelPass::add_nonpadding_oprs_replace_func(LayoutTrans) {
     m_opr_replace_funcs[opr::Reshape::typeinfo()] = replace_nonpadding_oprs;
     m_opr_replace_funcs[opr::GetVarShape::typeinfo()] = replace_nonpadding_oprs;
     m_opr_replace_funcs[opr::Concat::typeinfo()] = replace_nonpadding_oprs;
-    m_opr_replace_funcs[opr::Reduce::typeinfo()] = replace_nonpadding_oprs;
-    m_opr_replace_funcs[opr::Subtensor::typeinfo()] = replace_nonpadding_oprs;
+    m_opr_replace_funcs[opr::Dimshuffle::typeinfo()] = replace_nonpadding_oprs;
+    m_opr_replace_funcs[opr::Argmax::typeinfo()] = replace_nonpadding_oprs;
+    m_opr_replace_funcs[opr::Argmin::typeinfo()] = replace_nonpadding_oprs;
+    m_opr_replace_funcs[opr::IncrSubtensor::typeinfo()] = replace_nonpadding_oprs;
+    m_opr_replace_funcs[opr::AssertEqual::typeinfo()] = replace_nonpadding_oprs;
 }
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/gopt/include/megbrain/gopt/inference.h b/src/gopt/include/megbrain/gopt/inference.h
index 6f12dfba..6c97db87 100644
--- a/src/gopt/include/megbrain/gopt/inference.h
+++ b/src/gopt/include/megbrain/gopt/inference.h
@@ -508,39 +508,48 @@ public:
  * assume input network is built in NCHW tensor format
  */
 class PaddingChannelPass final : public Pass {
-public:
+private:
     using ChannelAlignmentMap = ThinHashMap<DTypeEnum, thin_function<size_t(size_t, bool)>>;
     using LayoutTrans = cg::GraphCommonOptimizeOptions::LayoutTransform;
+    using ReplaceFuncs = ThinHashMap<
+            Typeinfo*,
+            thin_function<OperatorNodeBase*(OperatorNodeBase*, const VarNodeArray&)>>;
 
+public:
     const char* name() const override;
     void apply(OptState& opt) const override;
     void fill_opr_convert_fun(LayoutTrans layout_trans);
 
-    using ReplaceFuncs = ThinHashMap<
-            Typeinfo*,
-            thin_function<OperatorNodeBase*(OperatorNodeBase*, const VarNodeArray&)>>;
-
     //! make channel padding opt pass with given tensor format
-    static std::unique_ptr<PaddingChannelPass> make(LayoutTrans layout_transform);
+    static std::unique_ptr<PaddingChannelPass> make(
+            LayoutTrans layout_transform, bool only_padding_weights = false);
 
 private:
+    PaddingChannelPass(bool only_padding_weights = false)
+            : m_only_padding_weights(only_padding_weights) {}
+
     VarNode* extract_subtensor(VarNode* inp, const TensorShape& orig_shape) const;
     VarNode* pad_in_channels(VarNode* inp, size_t pad_channels);
     VarNode* pad_out_channels(VarNode* inp, size_t pad_channels);
 
-    OperatorNodeBase* padding_policy(
+    OperatorNodeBase* padding_conv_policy(
+            OperatorNodeBase* opr, const VarNodeArray& new_inp);
+
+    OperatorNodeBase* padding_channel_wise_conv_policy(
             OperatorNodeBase* opr, const VarNodeArray& new_inp);
 
-    void add_convbias_replace_func(LayoutTrans layout_transform);
+    void add_conv_replace_func(LayoutTrans layout_transform);
     void add_conv_backward_data_replace_func(LayoutTrans layout_transform);
     void add_format_aware_opr_replace_func(LayoutTrans layout_transform);
     void add_elemwise_like_opr_replace_func(LayoutTrans layout_transform);
+    void add_condition_padding_oprs_replace_func(LayoutTrans layout_transform);
     void add_nonpadding_oprs_replace_func(LayoutTrans layout_transform);
 
     ChannelAlignmentMap m_alignment_map;
     ReplaceFuncs m_opr_replace_funcs;
     mutable ThinHashSet<OperatorNodeBase*> m_padding_oprs;
+    bool m_only_padding_weights;
 };
 
 /*!
diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp
index 750b5fa7..06eafe01 100644
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -4004,8 +4004,11 @@ TEST(TestGoptInference, ConvertFormatNCHW44) {
     ASSERT_EQ(
             opr::Convolution::Param::Format::NCHW44,
             find_opr<opr::Convolution>(y_opt, "conv4").param().format);
+
+    //! nchw44 adds the PaddingChannel pass, which pads the conv5 output
+    //! channels, so conv5 is converted to the nchw44 format
     ASSERT_EQ(
-            opr::Convolution::Param::Format::NCHW,
+            opr::Convolution::Param::Format::NCHW44,
             find_opr<opr::Convolution>(y_opt, "conv5").param().format);
 
     graph->compile({{y_opt, {}}})
@@ -4206,7 +4209,6 @@ TEST(TestGoptInference, ConvertFormatNCHW44_DOT) {
     auto w1 = mkcvar("w1", {8, 3, 3, 3}),
          conv1 = opr::Convolution::make(
                  x, w1, param_conv, {}, OperatorNodeConfig("conv1"));
-    printf("create conv1 %s\n", conv1.node()->owner_opr()->dyn_typeinfo()->name);
     param_conv.pad_h = param_conv.pad_w = 1;
     //! no supported hybrid nchw44
     opr::ConvBias::Param param_conv_bias_pad0;
@@ -4313,8 +4315,10 @@ TEST(TestGoptInference, ConvertFormatNCHW44_DOT) {
     ASSERT_EQ(
             opr::Convolution::Param::Format::NCHW44,
             find_opr<opr::Convolution>(y_opt, "conv4").param().format);
+    //! nchw44-dot adds the PaddingChannel pass by default, so the conv5 output
+    //! channels are padded to a multiple of 4 and conv5 is converted to the
+    //! nchw44 format.
     ASSERT_EQ(
-            opr::Convolution::Param::Format::NCHW,
+            opr::Convolution::Param::Format::NCHW44,
             find_opr<opr::Convolution>(y_opt, "conv5").param().format);
 
     graph->compile({{y_opt, {}}})
diff --git a/src/gopt/test/padding_channel.cpp b/src/gopt/test/padding_channel.cpp
new file mode 100644
index 00000000..58f734e4
--- /dev/null
+++ b/src/gopt/test/padding_channel.cpp
@@ -0,0 +1,446 @@
+#include "megbrain/graph/cg.h"
+#include "megbrain/opr/dnn/local.h"
+
+#include "megbrain/gopt/basic_arith.h"
+#include "megbrain/gopt/gtrans.h"
+#include "megbrain/gopt/inference.h"
+
+#include "megbrain/opr/basic_arith_wrapper.h"
+#include "megbrain/opr/blas.h"
+#include "megbrain/opr/dnn/adaptive_pooling.h"
+#include "megbrain/opr/dnn/batch_norm.h"
+#include "megbrain/opr/dnn/convolution.h"
+#include "megbrain/opr/dnn/pooling.h"
+#include "megbrain/opr/imgproc.h"
+#include "megbrain/opr/io.h"
+#include "megbrain/opr/nn_int.h"
+#include "megbrain/opr/tensor_gen.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/opr/utility.h"
+
+#include "helper.h"
+#include "megbrain/comp_node_env.h"
+#include "megbrain/test/helper.h"
+
+#include "megdnn/tensor_format.h"
+
+#include
+#include
+
+using namespace mgb;
+
+namespace {
+//! find the first operator of the given type; assert if not found
+template <typename T>
+T* find_opr(SymbolVar endpoint) {
+    T* found = nullptr;
+    auto cb = [&found](cg::OperatorNodeBase* opr) {
+        if (!found && opr->same_type<T>()) {
+            found = &opr->cast_final_safe<T>();
+        }
+    };
+    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
+    mgb_assert(found, "not found opr from %s", endpoint.node()->name().c_str());
+    return found;
+}
+
+template <typename T>
+T* find_opr(SymbolVar endpoint, const std::string& node_name) {
+    T* found = nullptr;
+    auto cb = [&found, &node_name](cg::OperatorNodeBase* opr) {
+        if (!found && opr->same_type<T>() && opr->name() == node_name) {
+            found = &opr->cast_final_safe<T>();
+        }
+    };
+    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
+    mgb_assert(
+            found, "not found opr %s from %s", node_name.c_str(),
+            endpoint.node()->name().c_str());
+    return found;
+}
+}  // namespace
+
+TEST(TestGoptInference, ChannelPaddingNCHW44) {
+    HostTensorGenerator<> gen;
+    auto cn = CompNode::load("cpu0");
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+    auto mkcvar = [&](const char* name, const TensorShape& shp) {
+        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
+    };
+
+    auto host_x = gen({1, 3, 8, 8}, cn);
+    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
+    //! Hybrid nchw44 mode
+    opr::ConvBias::Param param_conv;
+    param_conv.pad_h = param_conv.pad_w = 1;
+    auto w1 = mkcvar("w1", {8, 3, 3, 3}), b1 = mkcvar("b1", {1, 8, 1, 1}),
+         conv1 = opr::ConvBias::make(
+                 x, w1, b1, param_conv, {}, OperatorNodeConfig("conv1"));
+
+    auto w2 = mkcvar("w2", {6, 8, 3, 3}), b2 = mkcvar("b2", {1, 6, 1, 1}),
+         conv2 = opr::ConvBias::make(
+                 conv1, w2, b2, param_conv, {}, OperatorNodeConfig("conv2"));
+    auto w3 = mkcvar("w3", {3, 6, 3, 3}), b3 = mkcvar("b3", {1, 3, 1, 1}),
+         conv3 = opr::ConvBias::make(
+                 conv2, w3, b3, param_conv, {}, OperatorNodeConfig("conv3"));
+
+    opr::Convolution::Param param_convolution;
+    param_convolution.sparse = opr::Convolution::Param::Sparse::GROUP;
+    //! channel wise convolution
+    auto w4 = mkcvar("w4", {3, 1, 1, 1, 1}),
+         conv4 = opr::Convolution::make(
+                 conv3, w4, param_convolution, {}, OperatorNodeConfig("conv4"));
+
+    param_convolution.sparse = opr::Convolution::Param::Sparse::DENSE;
+    auto w5 = mkcvar("w5", {6, 3, 1, 1}),
+         conv5 = opr::Convolution::make(
+                 conv4, w5, param_convolution, {}, OperatorNodeConfig("conv5"));
+
+    //! group convolution
+    param_convolution.sparse = opr::Convolution::Param::Sparse::GROUP;
+    auto w6 = mkcvar("w6", {2, 4, 3, 1, 1}),
+         conv6 = opr::Convolution::make(
+                 conv5, w6, param_convolution, {}, OperatorNodeConfig("conv6"));
+
+    param_convolution.sparse = opr::Convolution::Param::Sparse::DENSE;
+    auto w7 = mkcvar("w7", {3, 8, 1, 1}),
+         y = opr::Convolution::make(
+                 conv6, w7, param_convolution, {}, OperatorNodeConfig("conv7"));
+
+    SymbolVar y_opt;
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_fuse_conv_bias_nonlinearity();
+    options.enable_nchw44();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
+    auto conv1_opt = find_opr<opr::ConvBias>(y_opt, "conv1");
+    auto conv2_opt = find_opr<opr::ConvBias>(y_opt, "conv2");
+    auto conv3_opt = find_opr<opr::ConvBias>(y_opt, "conv3");
+    auto conv4_opt = find_opr<opr::Convolution>(y_opt, "conv4");
+    auto conv6_opt = find_opr<opr::Convolution>(y_opt, "conv6");
+    //! do not pad the input tensor
+    ASSERT_EQ(conv1_opt->input(0)->shape()[1], 3);
+    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44, conv1_opt->param().format);
+    //! the output channels are padded
+    ASSERT_EQ(conv2_opt->input(1)->shape()[0], 2);
+    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44, conv2_opt->param().format);
+    ASSERT_EQ(conv3_opt->input(1)->shape()[0], 1);
+    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44, conv3_opt->param().format);
+
+    ASSERT_EQ(conv4_opt->input(1)->shape()[0], 1);
+    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44, conv4_opt->param().format);
+    ASSERT_EQ(conv6_opt->input(0)->shape()[1], 6);
+    ASSERT_EQ(opr::Convolution::Param::Format::NCHW, conv6_opt->param().format);
+
+    //! the dst tensor channels must stay unchanged
+    ASSERT_EQ(y_opt.node()->shape()[1], 3);
+    graph->compile({{y_opt, {}}})
+            ->to_json()
+            ->writeto_fpath(output_file("TestGoptInference.ChannelPaddingNCHW44.json"));
+
+    HostTensorND host_y_opt, host_y;
+    auto func = graph->compile(
+            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
+
+    //! test changing the input shape
+    *host_x = *gen({2, 3, 32, 32}, cn);
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
+}
+
+TEST(TestGoptInference, ChannelPaddingSubtensor) {
+    HostTensorGenerator<> gen;
+    auto cn = CompNode::load("cpu0");
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+    auto mkcvar = [&](const char* name, const TensorShape& shp) {
+        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
+    };
+
+    auto host_x = gen({1, 3, 8, 8}, cn);
+    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
+    //! Hybrid nchw44 mode
+    opr::ConvBias::Param param_conv;
+    param_conv.pad_h = param_conv.pad_w = 1;
+    auto w1 = mkcvar("w1", {8, 3, 3, 3}), b1 = mkcvar("b1", {1, 8, 1, 1}),
+         conv1 = opr::ConvBias::make(
+                 x, w1, b1, param_conv, {}, OperatorNodeConfig("conv1"));
+
+    auto w2 = mkcvar("w2", {6, 8, 1, 1}),
+         conv2 = opr::Convolution::make(conv1, w2, {}, {}, OperatorNodeConfig("conv2"));
+    using AIdx = opr::indexing::AxisIndexer;
+    auto sub0 = opr::Subtensor::make(
+            conv2,
+            {AIdx::make_interval(
+                    2, conv2.make_scalar(1), conv2.make_scalar(4),
+                    conv2.make_scalar(1))},
+            OperatorNodeConfig("sub0"));
+    auto sub1 = opr::Subtensor::make(
+            conv2,
+            {AIdx::make_interval(
+                     1, conv2.make_scalar(1), conv2.make_scalar(2),
+                     conv2.make_scalar(1)),
+             AIdx::make_interval(
+                     2, conv2.make_scalar(1), conv2.make_scalar(4),
+                     conv2.make_scalar(1))},
+            OperatorNodeConfig("sub1"));
+    auto sub2 = opr::Subtensor::make(
+            conv2,
+            {AIdx::make_interval(1, conv2.make_scalar(5), {}, {}),
+             AIdx::make_interval(
+                     2, conv2.make_scalar(1), conv2.make_scalar(4),
+                     conv2.make_scalar(1))},
+            OperatorNodeConfig("sub2"));
+    auto y_sub = sub0 + sub1 + sub2;
+
+    SymbolVar y_pad;
+    unpack_vector(
+            gopt::GraphOptimizer{}
+                    .add_pass(gopt::PaddingChannelPass::make(
+                            cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW44,
+                            true))
+                    .apply({{y_sub}})
+                    .endpoint_vars(),
+            y_pad);
+    auto conv1_opt = find_opr<opr::ConvBias>(y_pad, "conv1");
+    auto conv2_opt = find_opr<opr::Convolution>(y_pad, "conv2");
+    auto sub0_opt = find_opr<opr::Subtensor>(y_pad, "sub0");
+    auto sub1_opt = find_opr<opr::Subtensor>(y_pad, "sub1");
+    auto sub2_opt = find_opr<opr::Subtensor>(y_pad, "sub2");
+    //! do not pad the input tensor
+    ASSERT_EQ(conv1_opt->input(0)->shape()[1], 3);
+    //! the output channels are padded
+    ASSERT_EQ(conv2_opt->input(1)->shape()[0], 8);
+    ASSERT_EQ(conv2_opt->output(0)->shape()[1], 8);
+
+    //! sub0 does not work on the channel dim, so no Subtensor is added
+    ASSERT_EQ(sub0_opt->input(0)->shape()[1], 8);
+    //! sub1 works on the channel dim, but its end is explicit, so no Subtensor is added
+    ASSERT_EQ(sub1_opt->input(0)->shape()[1], 8);
+    //! sub2 works on the channel dim and its end is default, so a Subtensor is added
+    ASSERT_EQ(sub2_opt->input(0)->shape()[1], 6);
+
+    //! the dst tensor channels must stay unchanged
+    ASSERT_EQ(y_pad.node()->shape()[1], 6);
+    graph->compile({{y_pad, {}}})
+            ->to_json()
+            ->writeto_fpath(
+                    output_file("TestGoptInference.ChannelPaddingSubtensor.json"));
+
+    HostTensorND host_y_opt, host_y;
+    auto func = graph->compile(
+            {make_callback_copy(y_sub, host_y), make_callback_copy(y_pad, host_y_opt)});
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
+
+    //! test changing the input shape
+    *host_x = *gen({2, 3, 32, 32}, cn);
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
+}
+
+TEST(TestGoptInference, ChannelPaddingReduce) {
+    HostTensorGenerator<> gen;
+    auto cn = CompNode::load("cpu0");
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+    auto mkcvar = [&](const char* name, const TensorShape& shp) {
+        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
+    };
+
+    auto host_x = gen({1, 3, 8, 8}, cn);
+    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
+    //! Hybrid nchw44 mode
+    opr::ConvBias::Param param_conv;
+    param_conv.pad_h = param_conv.pad_w = 1;
+    auto w1 = mkcvar("w1", {8, 3, 3, 3}), b1 = mkcvar("b1", {1, 8, 1, 1}),
+         conv1 = opr::ConvBias::make(
+                 x, w1, b1, param_conv, {}, OperatorNodeConfig("conv1"));
+
+    auto w2 = mkcvar("w2", {6, 8, 1, 1}),
+         conv2 = opr::Convolution::make(conv1, w2, {}, {}, OperatorNodeConfig("conv2"));
+    auto reduce0 = opr::Reduce::make(
+            conv2, {opr::Reduce::Mode::MAX, 1}, {}, OperatorNodeConfig("reduce0"));
+    auto reduce1 = opr::Reduce::make(
+            conv2, {opr::Reduce::Mode::MAX, 2}, {}, OperatorNodeConfig("reduce1"));
+    auto y_reduce = reduce0 + reduce1;
+
+    SymbolVar y_pad;
+    unpack_vector(
+            gopt::GraphOptimizer{}
+                    .add_pass(gopt::PaddingChannelPass::make(
+                            cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW44,
+                            true))
+                    .apply({{y_reduce}})
+                    .endpoint_vars(),
+            y_pad);
+    auto conv1_opt = find_opr<opr::ConvBias>(y_pad, "conv1");
+    auto conv2_opt = find_opr<opr::Convolution>(y_pad, "conv2");
+    auto reduce0_opt = find_opr<opr::Reduce>(y_pad, "reduce0");
+    auto reduce1_opt = find_opr<opr::Reduce>(y_pad, "reduce1");
+    //! do not pad the input tensor
+    ASSERT_EQ(conv1_opt->input(0)->shape()[1], 3);
+    //! the output channels are padded
+    ASSERT_EQ(conv2_opt->input(1)->shape()[0], 8);
+    ASSERT_EQ(conv2_opt->output(0)->shape()[1], 8);
+
+    //! reduce0 works on the channel dim, so a Subtensor is added
+    ASSERT_EQ(reduce0_opt->input(0)->shape()[1], 6);
+    //! reduce1 does not work on the channel dim, so no Subtensor is added
+    ASSERT_EQ(reduce1_opt->input(0)->shape()[1], 8);
+
+    //! the dst tensor channels must stay unchanged
+    ASSERT_EQ(y_pad.node()->shape()[1], 6);
+    graph->compile({{y_pad, {}}})
+            ->to_json()
+            ->writeto_fpath(output_file("TestGoptInference.ChannelPaddingReduce.json"));
+
+    HostTensorND host_y_opt, host_y;
+    auto func = graph->compile(
+            {make_callback_copy(y_reduce, host_y),
+             make_callback_copy(y_pad, host_y_opt)});
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
+
+    //! test changing the input shape
+    *host_x = *gen({2, 3, 32, 32}, cn);
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
+}
+
+TEST(TestGoptInference, ChannelPaddingMisc) {
+    HostTensorGenerator<> gen;
+    auto cn = CompNode::load("cpu0");
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+    auto mkcvar = [&](const char* name, const TensorShape& shp) {
+        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
+    };
+
+    auto host_x = gen({1, 3, 8, 8}, cn);
+    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
+    //! Hybrid nchw44 mode
+    opr::ConvBias::Param param_conv;
+    param_conv.pad_h = param_conv.pad_w = 1;
+    auto w1 = mkcvar("w1", {8, 3, 3, 3}), b1 = mkcvar("b1", {1, 8, 1, 1}),
+         conv1 = opr::ConvBias::make(
+                 x, w1, b1, param_conv, {}, OperatorNodeConfig("conv1"));
+
+    auto w2 = mkcvar("w2", {6, 8, 1, 1}),
+         conv2 = opr::Convolution::make(conv1, w2, {}, {}, OperatorNodeConfig("conv2"));
+    auto elem0 = conv2 + 1;
+    auto concat = opr::Concat::make({elem0, conv2}, 1, OperatorNodeConfig("concat"));
+
+    SymbolVar y_pad;
+    unpack_vector(
+            gopt::GraphOptimizer{}
+                    .add_pass(gopt::PaddingChannelPass::make(
+                            cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW44,
+                            true))
+                    .apply({{concat}})
+                    .endpoint_vars(),
+            y_pad);
+    auto conv1_opt = find_opr<opr::ConvBias>(y_pad, "conv1");
+    auto conv2_opt = find_opr<opr::Convolution>(y_pad, "conv2");
+    auto elemwise0_opt = find_opr<opr::Elemwise>(y_pad);
+    auto concat_opt = find_opr<opr::Concat>(y_pad, "concat");
+    //! do not pad the input tensor
+    ASSERT_EQ(conv1_opt->input(0)->shape()[1], 3);
+    //! the output channels are padded
+    ASSERT_EQ(conv2_opt->input(1)->shape()[0], 8);
+    ASSERT_EQ(conv2_opt->output(0)->shape()[1], 8);
+
+    ASSERT_EQ(elemwise0_opt->output(0)->shape()[1], 8);
+    ASSERT_EQ(concat_opt->input(0)->shape()[1], 6);
+    ASSERT_EQ(concat_opt->input(1)->shape()[1], 6);
+
+    //! the dst tensor channels must stay unchanged
+    ASSERT_EQ(y_pad.node()->shape()[1], 12);
+    graph->compile({{y_pad, {}}})
+            ->to_json()
+            ->writeto_fpath(output_file("TestGoptInference.ChannelPaddingMisc.json"));
+
+    HostTensorND host_y_opt, host_y;
+    auto func = graph->compile(
+            {make_callback_copy(concat, host_y),
+             make_callback_copy(y_pad, host_y_opt)});
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
+
+    //! test changing the input shape
+    *host_x = *gen({2, 3, 32, 32}, cn);
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
+}
+
+TEST(TestGoptInference, ChannelPaddingMoreOp) {
+    HostTensorGenerator<> gen;
+    auto cn = CompNode::load("cpu0");
+    auto graph = ComputingGraph::make();
+    auto mkvar = [&](const char* name, const TensorShape& shp) {
+        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
+    };
+    auto mkcvar = [&](const char* name, const TensorShape& shp) {
+        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
+    };
+
+    auto host_x = gen({2, 3, 8, 8}, cn);
+    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
+
+    opr::Convolution::Param param;
+    param.pad_h = param.pad_w = 1;
+    auto w1 = mkcvar("w1", {6, 3, 3, 3}), conv = opr::Convolution::make(x, w1, param);
+    auto shape_of = opr::GetVarShape::make(conv);
+    auto subtensor = opr::Subtensor::make(
+            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
+                              0, x.make_scalar(2), None, x.make_scalar(1))});
+
+    opr::Resize::Param param_resize;
+    param_resize.format = opr::Resize::Param::Format::NCHW;
+    auto resize = opr::ResizeForward::make(conv, subtensor * 2, param_resize);
+    auto mat = mkcvar("mat", {2, 3, 3}),
+         warp = opr::WarpPerspectiveForward::make(
+                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
+
+    auto b = mkvar("b", {1, 6, 1, 1}),
+         elem = opr::Elemwise::make({warp + b}, opr::Elemwise::Param::Mode::RELU);
+    param.pad_h = param.pad_w = 1;
+    auto w2 = mkcvar("w2", {7, 6, 3, 3}), y = opr::Convolution::make(elem, w2, param),
+         z = opr::AxisAddRemove::make(y, {opr::AxisAddRemove::AxisDesc::make_add(0)});
+
+    SymbolVar y_pad, z_pad;
+    unpack_vector(
+            gopt::GraphOptimizer{}
+                    .add_pass(gopt::PaddingChannelPass::make(
+                            cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW44,
+                            true))
+                    .apply({{y}})
+                    .endpoint_vars(),
+            y_pad);
+    unpack_vector(
+            gopt::GraphOptimizer{}
+                    .add_pass(gopt::PaddingChannelPass::make(
+                            cg::GraphCommonOptimizeOptions::LayoutTransform::NCHW44,
+                            true))
+                    .apply({{z}})
+                    .endpoint_vars(),
+            z_pad);
+
+    graph->compile({{y_pad, {}}})
+            ->to_json()
+            ->writeto_fpath(output_file("TestGoptInference.ChannelPaddingMoreOp.json"));
+
+    HostTensorND host_y_opt, host_y;
+    auto func = graph->compile(
+            {make_callback_copy(y, host_y), make_callback_copy(y_pad, host_y_opt)});
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
+
+    *host_x = *gen({2, 3, 16, 16}, cn);
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
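
For reviewers, a minimal standalone sketch (not part of the patch) of the alignment rule that the `padding_4`/`padding_8`/`padding_int4` helpers above implement: the pad amount is the distance from the current channel count up to the next multiple of the layout's alignment, and zero when already aligned. `pad_to_multiple` is a hypothetical name used only for illustration; the real helpers hard-code the alignment per dtype and layout.

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical helper mirroring the padding_* functions in this patch:
// number of zero channels needed to round in_channel up to a multiple
// of `align`; returns 0 when in_channel is already aligned.
static size_t pad_to_multiple(size_t in_channel, size_t align) {
    return (align - (in_channel % align)) % align;
}

int main() {
    assert(pad_to_multiple(3, 4) == 1);   // padding_4 for NCHW44: 3 -> 4
    assert(pad_to_multiple(8, 4) == 0);   // already aligned, no padding
    assert(pad_to_multiple(6, 8) == 2);   // padding_8 for NCHW88: 6 -> 8
    assert(pad_to_multiple(20, 8) == 4);  // padding_int4 when in_channel <= 32
    return 0;
}
```

The `only_padding_weights` flag added by this patch selects between the two uses of this rule: the CPU layouts (nhwcd4, nchw44, nchw44-dot, nchw88) pass `true`, so only weight output channels (and biases) are zero-padded while activations keep their logical channel count, whereas nchw64 passes `false` and also pads activation input channels.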