From 63a9bd30a85cd428e33829443000761c14d1c3f6 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Wed, 31 Mar 2021 18:46:54 +0800
Subject: [PATCH] feat(mgb/gopt): add an opt pass for padding channels to
 enable fast int8/int4 support on GPU

GitOrigin-RevId: 94c719bb5c5410925f57d626de088b86afed4750
---
 src/gopt/impl/tensor_reformat.cpp          | 340 +++++++++++++++++++++++++++++
 src/gopt/include/megbrain/gopt/inference.h |  10 +
 src/gopt/test/inference.cpp                | 307 ++++++++++++++++++++++++++
 3 files changed, 657 insertions(+)

diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp
index b3fe04e3..8a198ab9 100644
--- a/src/gopt/impl/tensor_reformat.cpp
+++ b/src/gopt/impl/tensor_reformat.cpp
@@ -3624,4 +3624,344 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
     MIDOUT_E
 }
 
+/* ==================== PaddingChannelPass ================= */
+const char* PaddingChannelPass::name() const {
+    return mgb_cstr_log("padding output channel to multiple of 4/32");
+}
+
+void PaddingChannelPass::apply(OptState& opt) const {
+    MIDOUT_B("PaddingChannelPass::apply");
+    // do not check shape
+    opt.set_var_replace_check_flag(VarReplaceCheckFlag::CHECK_ALL ^
+                                   VarReplaceCheckFlag::CHECK_SHAPE);
+
+    ThinHashSet<OperatorNodeBase*> padding_oprs;
+    ThinHashMap<Typeinfo*, thin_function<OperatorNodeBase*(
+                                   OperatorNodeBase*, const VarNodeArray&)>>
+            opr_replace_funcs;
+
+    auto rewriter = opt.graph().make_rewriter();
+    auto pad_in_channels = [](VarNode* inp, size_t pad_channels) -> VarNode* {
+        mgb_assert(inp->shape().ndim == 4);
+        mgb_assert(inp->dtype().enumv() == DTypeEnum::QuantizedS8 ||
+                   inp->dtype().enumv() == DTypeEnum::QuantizedS32);
+        TensorShape shape{inp->shape()[0], pad_channels, inp->shape()[2],
+                          inp->shape()[3]};
+        std::shared_ptr<HostTensorND> host_val = std::make_shared<HostTensorND>(
+                inp->comp_node(), shape, inp->dtype());
+        auto ptr = host_val->raw_ptr();
+        std::memset(ptr, 0, shape.total_nr_elems() * inp->dtype().size());
+        auto padding =
+                opr::ImmutableTensor::make(*inp->owner_graph(), *host_val);
+        auto out = opr::Concat::make({inp, padding}, 1);
+        return out.node();
+    };
+
+    auto pad_out_channels = [](VarNode* inp, size_t pad_channels) -> VarNode* {
+        mgb_assert(inp->shape().ndim == 4);
+        mgb_assert(inp->dtype().enumv() == DTypeEnum::QuantizedS8 ||
+                   inp->dtype().enumv() == DTypeEnum::QuantizedS32);
+        TensorShape shape{pad_channels, inp->shape()[1], inp->shape()[2],
+                          inp->shape()[3]};
+        std::shared_ptr<HostTensorND> host_val = std::make_shared<HostTensorND>(
+                inp->comp_node(), shape, inp->dtype());
+        auto ptr = host_val->raw_ptr();
+        std::memset(ptr, 0, shape.total_nr_elems() * inp->dtype().size());
+        auto padding =
+                opr::ImmutableTensor::make(*inp->owner_graph(), *host_val);
+        auto out = opr::Concat::make({inp, padding}, 0);
+        return out.node();
+    };
+
+    auto extract_subtensor = [](VarNode* inp,
+                                size_t orig_channels) -> VarNode* {
+        mgb_assert(inp->shape().ndim == 4);
+        auto x = SymbolVar(inp);
+        auto cv = [&x](int v) { return x.make_scalar(v); };
+        using AIdx = opr::Subtensor::AxisIndexer;
+        auto sub = opr::Subtensor::make(
+                x, {AIdx::make_interval(0, None, None, cv(1)),
+                    AIdx::make_interval(1, None, cv(orig_channels), None),
+                    AIdx::make_interval(2, None, None, cv(1)),
+                    AIdx::make_interval(3, None, None, cv(1))});
+        return sub.node();
+    };
+
+    // padding policy for conv bias with data type qint8
+    auto padding_policy_qint8 = [&padding_oprs, &pad_in_channels,
+                                 &pad_out_channels](
+                                        OperatorNodeBase* opr,
+                                        const VarNodeArray& new_inp) {
+        mgb_assert(opr->input().size() == new_inp.size());
+        mgb_assert(new_inp.size() == 3);
+        mgb_assert(opr->input(1)->shape().eq_shape(new_inp[1]->shape()));
+        auto
inps = new_inp; + size_t out_channels = opr->input(1)->shape()[0]; + size_t in_channels = opr->input(1)->shape()[1]; + size_t new_in_channels = new_inp[0]->shape()[1]; + // pad input channels + if (padding_oprs.count(opr->input(0)->owner_opr())) { + size_t pad_channels = new_in_channels - in_channels; + inps[1] = pad_in_channels(new_inp[1], pad_channels); + } else { + size_t pad_channels = 0; + mgb_assert(new_in_channels == in_channels); + if (in_channels <= 16) { + if (in_channels % 4) + pad_channels = 4 - (in_channels % 4); // pad to use dp4a + } else { + if (in_channels % 32) + pad_channels = + 32 - (in_channels % 32); // pad to use tensorcore + } + if (pad_channels > 0) { + inps[0] = pad_in_channels(new_inp[0], pad_channels); + inps[1] = pad_in_channels(new_inp[1], pad_channels); + } + } + out_channels = inps[1]->shape()[0]; + in_channels = inps[1]->shape()[1]; + size_t pad_channels = 0; + if (in_channels <= 16) { + if (out_channels % 4) + pad_channels = 4 - (out_channels % 4); + } else { + if (out_channels <= 16) { + if (out_channels % 4) + pad_channels = 4 - (out_channels % 4); + } else { + if (out_channels % 32) + pad_channels = 32 - (out_channels % 32); + } + } + if (pad_channels > 0) { + inps[1] = pad_out_channels(inps[1], pad_channels); + inps[2] = pad_in_channels(inps[2], pad_channels); + padding_oprs.insert(opr); + } + return serialization::copy_opr_shallow(*opr, inps, opr->config()); + }; + + // padding policy for conv bias with data type qint4 and quint4 + auto padding_policy_int4 = [&padding_oprs, &pad_in_channels, + &pad_out_channels]( + OperatorNodeBase* opr, + const VarNodeArray& new_inp) { + mgb_assert(opr->input().size() == new_inp.size()); + mgb_assert(new_inp.size() == 3); + mgb_assert(opr->input(1)->shape().eq_shape(new_inp[1]->shape())); + auto inps = new_inp; + return serialization::copy_opr_shallow(*opr, inps, opr->config()); + }; + + opr_replace_funcs[opr::ConvBiasForward::typeinfo()] = + [&padding_oprs, &padding_policy_qint8, &padding_policy_int4]( + OperatorNodeBase* opr, const VarNodeArray& new_inp) { + if (opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8) { + return padding_policy_qint8(opr, new_inp); + } else if (opr->input(0)->dtype().enumv() == + DTypeEnum::QuantizedS4 || + opr->input(0)->dtype().enumv() == + DTypeEnum::Quantized4Asymm) { + return padding_policy_int4(opr, new_inp); + } else { + mgb_assert( + padding_oprs.count(opr->input(0)->owner_opr()) == 0, + "conv bias operator for data type(%s) cannot be " + "padded channel. " + "consumer(%s), producer(%s)", + opr->input(0)->dtype().name(), opr->cname(), + opr->input(0)->owner_opr()->cname()); + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } + }; + opr_replace_funcs[opr::ConvolutionBackwardData::typeinfo()] = + [&padding_oprs, &pad_in_channels, &pad_out_channels]( + OperatorNodeBase* opr, const VarNodeArray& new_inp) { + if (opr->input(1)->dtype().enumv() != DTypeEnum::QuantizedS8) { + mgb_assert( + padding_oprs.count(opr->input(0)->owner_opr()) == 0, + "conv bwd data operator for data type(%s) cannot " + "be " + "padded channel. 
" + "consumer(%s), producer(%s)", + opr->input(0)->dtype().name(), opr->cname(), + opr->input(0)->owner_opr()->cname()); + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } + mgb_assert(opr->input().size() == new_inp.size()); + mgb_assert(new_inp.size() == 2, + "deconv (conv bwd data) operator for inference can " + "only have 2 input vars(got:%zu)", + new_inp.size()); + mgb_assert( + opr->input(0)->shape().eq_shape(new_inp[0]->shape())); + auto inps = new_inp; + size_t out_channels = opr->input(0)->shape()[0]; + size_t in_channels = opr->input(0)->shape()[1]; + size_t new_out_channels = new_inp[1]->shape()[1]; + // pad output channels + if (padding_oprs.count(opr->input(1)->owner_opr())) { + size_t pad_channels = new_out_channels - out_channels; + inps[0] = pad_out_channels(new_inp[0], pad_channels); + } else { + size_t pad_channels = 0; + if (out_channels % 4) + pad_channels = 4 - (out_channels % 4); + if (pad_channels > 0) { + inps[0] = pad_out_channels(new_inp[0], pad_channels); + inps[1] = pad_in_channels(new_inp[1], pad_channels); + } + } + out_channels = inps[0]->shape()[0]; + in_channels = inps[0]->shape()[1]; + // pad input channels + size_t pad_channels = 0; + if (in_channels % 4) + pad_channels = 4 - (in_channels % 4); + if (pad_channels > 0) { + inps[0] = pad_in_channels(inps[0], pad_channels); + padding_oprs.insert(opr); + } + return serialization::copy_opr_shallow(*opr, inps, + opr->config()); + }; + auto replace_format_aware_opr = [&padding_oprs]( + OperatorNodeBase* opr, + const VarNodeArray& new_inp) { + if (opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS8 && + opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS4 && + opr->input(0)->dtype().enumv() != DTypeEnum::Quantized4Asymm) { + mgb_assert(padding_oprs.count(opr->input(0)->owner_opr()) == 0, + "operator(type:%s,name:%s) for data type(%s) cannot be " + "padded channel. 
extra info:" + "consumer(%s), producer(%s)", + opr->dyn_typeinfo()->name, opr->cname(), + opr->input(0)->dtype().name(), opr->cname(), + opr->input(0)->owner_opr()->cname()); + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } + mgb_assert(opr->input().size() == new_inp.size()); + if (padding_oprs.count(opr->input(0)->owner_opr())) { + padding_oprs.insert(opr); + } + return serialization::copy_opr_shallow(*opr, new_inp, opr->config()); + }; + opr_replace_funcs[opr::PoolingForward::typeinfo()] = + replace_format_aware_opr; + opr_replace_funcs[opr::WarpPerspectiveForward::typeinfo()] = + replace_format_aware_opr; + + auto replace_elemwise_like_opr = [&padding_oprs, &extract_subtensor]( + OperatorNodeBase* opr, + const VarNodeArray& new_inp) { + mgb_assert(opr->input().size() == new_inp.size()); + bool have_padding_inp = false; + bool padding_all_inps = true; + bool same_padding = true; + size_t channels_after_padding = 0; + for (auto&& cur_inp : opr->input()) { + bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0; + if (padding_cur_inp) { + if (!have_padding_inp) + have_padding_inp = true; + if (channels_after_padding == 0) { + channels_after_padding = cur_inp->shape()[1]; + } else { + same_padding = + channels_after_padding == cur_inp->shape()[1]; + } + } + if (padding_all_inps && (!padding_cur_inp || !same_padding)) + padding_all_inps = false; + } + if (have_padding_inp && !padding_all_inps) { + auto inps = new_inp; + for (size_t i = 0; i < new_inp.size(); ++i) { + auto cur_inp = opr->input(i); + bool padding_cur_inp = + padding_oprs.count(cur_inp->owner_opr()) > 0; + if (padding_cur_inp) { + size_t orig_channels = cur_inp->shape()[1]; + inps[i] = extract_subtensor(inps[i], orig_channels); + } + } + return serialization::copy_opr_shallow(*opr, inps, opr->config()); + } + if (padding_all_inps) { + padding_oprs.insert(opr); + } + return serialization::copy_opr_shallow(*opr, new_inp, opr->config()); + }; + opr_replace_funcs[opr::ElemwiseMultiType::typeinfo()] = + replace_elemwise_like_opr; + opr_replace_funcs[opr::Elemwise::typeinfo()] = replace_elemwise_like_opr; + opr_replace_funcs[opr::TypeCvt::typeinfo()] = replace_elemwise_like_opr; + + auto replace_nonpadding_oprs = [&padding_oprs, &extract_subtensor]( + OperatorNodeBase* opr, + const VarNodeArray& new_inp) { + mgb_assert(opr->input().size() == new_inp.size()); + bool have_padding_inp = false; + auto inps = new_inp; + for (size_t i = 0; i < new_inp.size(); ++i) { + auto cur_inp = opr->input(i); + bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0; + if (padding_cur_inp) { + if (!have_padding_inp) + have_padding_inp = true; + size_t orig_channels = cur_inp->shape()[1]; + inps[i] = extract_subtensor(inps[i], orig_channels); + } + } + return serialization::copy_opr_shallow(*opr, inps, opr->config()); + }; + opr_replace_funcs[opr::Reshape::typeinfo()] = replace_nonpadding_oprs; + opr_replace_funcs[opr::GetVarShape::typeinfo()] = replace_nonpadding_oprs; + opr_replace_funcs[opr::Concat::typeinfo()] = replace_nonpadding_oprs; + + auto on_opr = [&opt, &rewriter, &opr_replace_funcs, + &extract_subtensor](OperatorNodeBase* opr) { + auto it = opr_replace_funcs.find(opr->dyn_typeinfo()); + if (it != opr_replace_funcs.end()) { + VarNodeArray new_inp; + new_inp.reserve(opr->input().size()); + for (auto&& inp : opr->input()) { + new_inp.push_back(rewriter.get_var(inp)); + } + auto new_opr = (it->second)(opr, new_inp); + auto &&out0 = opr->output(), &&out1 = new_opr->output(); + 
mgb_assert(out0.size() == out1.size(), + "bad opr replace: src=%s{%s} dst=%s{%s}, " + "src.size=%zu " + "dst.size=%zu", + opr->cname(), opr->dyn_typeinfo()->name, + new_opr->cname(), new_opr->dyn_typeinfo()->name, + out0.size(), out1.size()); + for (size_t i = 0; i < out0.size(); ++i) { + if (!out0[i]->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) { + mgb_assert(!out1[i]->contain_flag( + VarNode::Flag::VOLATILE_CONTENT)); + auto src = out0[i]; + auto dst = out1[i]; + if (opt.graph().endpoint_contain(src) && + !src->shape().eq_shape(dst->shape())) { + size_t orig_channels = src->shape()[1]; + dst = extract_subtensor(dst, orig_channels); + } + rewriter.replace_var(src, dst, nullptr); + } + } + } else { + rewriter.auto_replace_outputs(opr); + } + }; + opt.graph().iter(on_opr); + rewriter.apply_inplace(); + + MIDOUT_E +} // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/gopt/include/megbrain/gopt/inference.h b/src/gopt/include/megbrain/gopt/inference.h index aca7ad7b..1ff3701e 100644 --- a/src/gopt/include/megbrain/gopt/inference.h +++ b/src/gopt/include/megbrain/gopt/inference.h @@ -409,6 +409,16 @@ namespace gopt { void apply(OptState& opt) const override; }; + /*! + * \brief padding channel to enable fast int8/int4 support + * assume input network is built in NCHW tensor format + */ + class PaddingChannelPass final : public Pass { + public: + const char* name() const override; + void apply(OptState& opt) const override; + }; + } // namespace gopt } // namespace mgb diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index 70a6ea89..312594f1 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -4178,6 +4178,313 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) { MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse); } #endif + +TEST(TestGoptInference, PaddingChannels) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver < 61) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 61); + return; + } + + HostTensorGenerator gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + auto mkcvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) + .rename(name), + dtype); + }; + + auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)), + w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)), + b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f)); + opr::ConvBias::Param param; + param.format = opr::ConvBias::Param::Format::NCHW; + param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; + param.stride_h = param.stride_w = 1; + param.pad_h = param.pad_w = 1; + + auto y = opr::ConvBias::make(x, w, b, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)), + b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f)); + auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + auto w2 = mkcvar("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f)), + b2 = mkcvar("b2", 
{1, 20, 1, 1}, dtype::QuantizedS32(6.25f)); + auto y2 = opr::ConvBias::make(y1, w2, b2, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode; + auto y3 = opr::ElemwiseMultiType::make( + {y, y2}, {ElemMultiMode::QFUSE_ADD_RELU}, + OperatorNodeConfig{dtype::QuantizedS8{1.2f}}); + y3 = opr::TypeCvt::make(y3, dtype::Float32()); + SymbolVar y3_pad; + unpack_vector(gopt::GraphOptimizer{} + .add_pass() + .apply({{y3}}) + .endpoint_vars(), + y3_pad); + ASSERT_EQ(y3_pad.node()->shape()[1], y3.node()->shape()[1]); + SmallVector oprs; + auto cb = [&oprs](cg::OperatorNodeBase* opr) { + if (opr->same_type()) { + oprs.push_back(opr); + } + }; + cg::DepOprIter{cb}.add(y3_pad.node()->owner_opr()); + ASSERT_EQ(oprs.size(), 3); + ASSERT_EQ(oprs[0]->output(0)->shape()[1], 20); + ASSERT_EQ(oprs[1]->output(0)->shape()[1], 32); + ASSERT_EQ(oprs[2]->output(0)->shape()[1], 32); + HostTensorND t1, t2; + auto func1 = graph->compile({make_callback_copy(y3, t1)}); + func1->execute(); + auto func2 = graph->compile({make_callback_copy(y3_pad, t2)}); + func2->execute(); + MGB_ASSERT_TENSOR_EQ(t1, t2); +} + +TEST(TestGoptInference, ConcatAfterPaddingChannels) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver < 61) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 61); + return; + } + + HostTensorGenerator gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + auto mkcvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) + .rename(name), + dtype); + }; + + auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)), + w = mkcvar("w", {18, 3, 3, 3}, dtype::QuantizedS8(2.5f)), + b = mkcvar("b", {1, 18, 1, 1}, dtype::QuantizedS32(6.25f)); + opr::ConvBias::Param param; + param.format = opr::ConvBias::Param::Format::NCHW; + param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; + param.stride_h = param.stride_w = 1; + param.pad_h = param.pad_w = 1; + + auto y = opr::ConvBias::make(x, w, b, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + auto w1 = mkcvar("w1", {18, 18, 3, 3}, dtype::QuantizedS8(2.5f)), + b1 = mkcvar("b1", {1, 18, 1, 1}, dtype::QuantizedS32(6.25f)); + auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + // concat at batch dim + auto y2 = opr::Concat::make({y, y1}, 0); + y2 = opr::TypeCvt::make(y2, dtype::Float32()); + SymbolVar y2_pad; + unpack_vector(gopt::GraphOptimizer{} + .add_pass() + .apply({{y2}}) + .endpoint_vars(), + y2_pad); + ASSERT_EQ(y2_pad.node()->shape()[1], y2.node()->shape()[1]); + SmallVector oprs; + auto cb = [&oprs](cg::OperatorNodeBase* opr) { + if (opr->same_type()) { + oprs.push_back(opr); + } + }; + cg::DepOprIter{cb}.add(y2_pad.node()->owner_opr()); + ASSERT_EQ(oprs.size(), 2); + ASSERT_EQ(oprs[0]->output(0)->shape()[1], 20); + ASSERT_EQ(oprs[1]->output(0)->shape()[1], 32); + HostTensorND t1, t2; + auto func1 = graph->compile({make_callback_copy(y2, t1)}); + func1->execute(); + auto func2 = 
graph->compile({make_callback_copy(y2_pad, t2)}); + func2->execute(); + MGB_ASSERT_TENSOR_EQ(t1, t2); +} + +// FIXME replace cpu with gpu to enable gpu validation +TEST(TestGoptInference, PaddingChannelsWithPooling) { + REQUIRE_GPU(1); + auto cn = CompNode::load("cpu0"); +// cn.activate(); +// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; +// auto sm_ver = prop.major * 10 + prop.minor; +// if (sm_ver < 61) { +// printf("This testcast ignored due to insufficient cuda cap(got: %d, " +// "expected: %d)\n", +// sm_ver, 61); +// return; +// } + + HostTensorGenerator gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + auto mkcvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) + .rename(name), + dtype); + }; + + auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)), + w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)), + b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f)); + opr::ConvBias::Param param; + param.format = opr::ConvBias::Param::Format::NCHW; + param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; + param.stride_h = param.stride_w = 1; + param.pad_h = param.pad_w = 1; + + auto y = opr::ConvBias::make(x, w, b, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)), + b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f)); + auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + + opr::Pooling::Param pool_param; + pool_param.format = opr::Pooling::Param::Format::NCHW; + y1 = opr::Pooling::make(y1, pool_param); + y1 = opr::TypeCvt::make(y1, dtype::Float32()); + SymbolVar y1_pad; + unpack_vector(gopt::GraphOptimizer{} + .add_pass() + .apply({{y1}}) + .endpoint_vars(), + y1_pad); + ASSERT_EQ(y1_pad.node()->shape()[1], y1.node()->shape()[1]); + SmallVector oprs; + auto cb = [&oprs](cg::OperatorNodeBase* opr) { + if (opr->same_type()) { + oprs.push_back(opr); + } + }; + cg::DepOprIter{cb}.add(y1_pad.node()->owner_opr()); + ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32); + HostTensorND t1, t2; + auto func1 = graph->compile({make_callback_copy(y1, t1)}); + func1->execute(); + auto func2 = graph->compile({make_callback_copy(y1_pad, t2)}); + func2->execute(); + MGB_ASSERT_TENSOR_EQ(t1, t2); +} + +// FIXME replace cpu with gpu to enable gpu validation +TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) { + REQUIRE_GPU(1); + auto cn = CompNode::load("cpu0"); +// cn.activate(); +// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; +// auto sm_ver = prop.major * 10 + prop.minor; +// if (sm_ver < 61) { +// printf("This testcast ignored due to insufficient cuda cap(got: %d, " +// "expected: %d)\n", +// sm_ver, 61); +// return; +// } + + HostTensorGenerator gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + auto mkcvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + 
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
+                        .rename(name),
+                dtype);
+    };
+
+    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
+            cn, TensorShape{16, 3, 3}, dtype::Float32());
+    warp_perspective_mat_gen(*mat, 16, 14, 14);
+    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
+
+    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
+         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
+         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
+    opr::ConvBias::Param param;
+    param.format = opr::ConvBias::Param::Format::NCHW;
+    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
+    param.stride_h = param.stride_w = 1;
+    param.pad_h = param.pad_w = 1;
+
+    auto y = opr::ConvBias::make(x, w, b, param, {},
+                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
+    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
+         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
+    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
+                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
+
+    opr::WarpPerspective::Param warp_param;
+    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
+    y1 = opr::WarpPerspective::make(y1, mat_var, TensorShape{14, 14},
+                                    warp_param);
+    y1 = opr::TypeCvt::make(y1, dtype::Float32());
+    SymbolVar y1_pad;
+    unpack_vector(gopt::GraphOptimizer{}
+                          .add_pass<gopt::PaddingChannelPass>()
+                          .apply({{y1}})
+                          .endpoint_vars(),
+                  y1_pad);
+    ASSERT_EQ(y1_pad.node()->shape()[1], y1.node()->shape()[1]);
+    SmallVector<cg::OperatorNodeBase*> oprs;
+    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
+        if (opr->same_type<opr::ConvBias>()) {
+            oprs.push_back(opr);
+        }
+    };
+    cg::DepOprIter{cb}.add(y1_pad.node()->owner_opr());
+    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32);
+    HostTensorND t1, t2;
+    auto func1 = graph->compile({make_callback_copy(y1, t1)});
+    func1->execute();
+    auto func2 = graph->compile({make_callback_copy(y1_pad, t2)});
+    func2->execute();
+    MGB_ASSERT_TENSOR_EQ(t1, t2);
+}
 #endif
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
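
Usage sketch: the new pass is driven through gopt::GraphOptimizer, in the same
way the tests above exercise it. The fragment below is illustrative only and is
not part of the patch; it assumes an NCHW graph with QuantizedS8 conv bias
operators already built into an endpoint SymbolVar `y`, and reuses the test
helpers from src/gopt/test/inference.cpp (graph, unpack_vector,
make_callback_copy).

    // Run only the channel-padding pass and fetch the rewritten endpoint.
    SymbolVar y_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_pad);
    // Endpoints keep their original channel count: the pass pads conv output
    // channels to a multiple of 4 (dp4a) or 32 (TensorCore) inside the graph
    // and slices the padding off again with a Subtensor at the output.
    ASSERT_EQ(y_pad.node()->shape()[1], y.node()->shape()[1]);
    // Numerical results must be unchanged by the padding.
    HostTensorND expected, got;
    auto f1 = graph->compile({make_callback_copy(y, expected)});
    f1->execute();
    auto f2 = graph->compile({make_callback_copy(y_pad, got)});
    f2->execute();
    MGB_ASSERT_TENSOR_EQ(expected, got);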