diff --git a/dnn/src/cuda/conv_bias/cudnn_conv.cpp b/dnn/src/cuda/conv_bias/cudnn_conv.cpp
index 1be71d81..51d86929 100644
--- a/dnn/src/cuda/conv_bias/cudnn_conv.cpp
+++ b/dnn/src/cuda/conv_bias/cudnn_conv.cpp
@@ -31,6 +31,15 @@ bool ConvBiasForwardImpl::AlgoCUDNNConv::is_available(
         }
     }
 
+    // FIXME: cudnn cannot handle the case when the initial value of dst tensor
+    // contains nan and beta is zero, because the result of 0.f * nan is still
+    // nan
+    if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8 &&
+        args.dst_layout->dtype.enumv() == DTypeEnum::Float32 &&
+        args.opr->param().format == param::ConvBias::Format::NCHW) {
+        return false;
+    }
+
     auto dst_layout = *args.dst_layout;
     if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
         dst_layout.dtype = DType();
diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp
index fce69d5a..674dd629 100644
--- a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp
+++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp
@@ -57,6 +57,15 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available(
     }
 #endif
 
+    // FIXME: cudnn cannot handle the case when the initial value of dst tensor
+    // contains nan and beta is zero, because the result of 0.f * nan is still
+    // nan
+    if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8 &&
+        args.dst_layout->dtype.enumv() == DTypeEnum::Float32 &&
+        param.format == param::ConvBias::Format::NCHW) {
+        return false;
+    }
+
     //! FIXME: conv kernel of cudnn for NCHW4_NCHW tensor format causes illegal
     //! memory access errors, so we have to disable this kernel here.
     if (param.format == param::ConvBias::Format::NCHW4_NCHW ||
diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp
index 4cb259d1..7d4241f3 100644
--- a/src/gopt/impl/tensor_reformat.cpp
+++ b/src/gopt/impl/tensor_reformat.cpp
@@ -1619,6 +1619,8 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
             megdnn::param::Convolution::Format::NCHW4;
     megdnn::param::ConvBias::Format conv_bias_format =
             megdnn::param::ConvBias::Format::NCHW4;
+    megdnn::param::ConvBias::Format conv_bias_format_nchw4_nchw =
+            megdnn::param::ConvBias::Format::NCHW4_NCHW;
     megdnn::param::BatchConvBias::Format batch_conv_bias_format =
             megdnn::param::BatchConvBias::Format::NCHW4;
     RelayoutMode src_to_nchw4_mode = RelayoutMode::NCHW_TO_NCHW4;
@@ -1821,6 +1823,7 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
         return new_opr;
     };
     auto replace_conv_bias_opr = [trans_nchw4, conv_bias_format,
+                                  conv_bias_format_nchw4_nchw,
                                   src_to_nchw4_mode](
                                          OperatorNodeBase* opr,
                                          const VarNodeArray& new_inp) {
@@ -1851,19 +1854,27 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
         conv_bias_filter = new_filter.node();
         // format: NCHW --> NCHW4
         auto new_param = conv_bias_opr.param();
-        new_param.format = conv_bias_format;
+        if (conv_bias_opr.output().size() > 0 &&
+            conv_bias_opr.output(0)->dtype().enumv() == DTypeEnum::Float32) {
+            new_param.format = conv_bias_format_nchw4_nchw;
+        } else {
+            new_param.format = conv_bias_format;
+        }
         if (new_inp.size() == 2) {
             auto new_conv_bias_opr = opr::ConvBias::make(
                     conv_bias_src, conv_bias_filter, new_param,
                     conv_bias_opr.execution_policy(), conv_bias_opr.config());
             OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
-            mgb_assert(new_conv_bias_opr.shape().ndim == 5,
-                       "The conv_bias dst dim is not trans to nchw4");
+            mgb_assert(
+                    new_conv_bias_opr.node()->dtype().enumv() == DTypeEnum::Float32 ||
+                            new_conv_bias_opr.shape().ndim == 5,
+                    "The conv_bias dst dim is not trans to nchw4");
             return new_opr;
         }
-        // bias: NCHW --> NCHW4
+        // bias: NCHW --> NCHW4 when bias_dtype is not Float32
         VarNode* conv_bias_bias = new_inp[2];
-        if (new_inp[2]->shape().ndim == 4) {
+        if (new_inp[2]->dtype().enumv() != DTypeEnum::Float32 &&
+            new_inp[2]->shape().ndim == 4) {
             auto new_bias =
                     RelayoutPlaceholder::make(new_inp[2], src_to_nchw4_mode);
             conv_bias_bias = new_bias.node();
@@ -1873,13 +1884,16 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
                     conv_bias_src, conv_bias_filter, conv_bias_bias, new_param,
                     conv_bias_opr.execution_policy(), conv_bias_opr.config());
             OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
-            mgb_assert(new_conv_bias_opr.shape().ndim == 5,
-                       "The conv_bias dst dim is not trans to nchw4");
+            mgb_assert(
+                    new_conv_bias_opr.node()->dtype().enumv() == DTypeEnum::Float32 ||
+                            new_conv_bias_opr.shape().ndim == 5,
+                    "The conv_bias dst dim is not trans to nchw4");
             return new_opr;
         }
-        // z_inp: NCHW --> NCHW4
+        // z_inp: NCHW --> NCHW4 when bias_dtype is not Float32
         VarNode* z_inp = new_inp[3];
-        if (new_inp[3]->shape().ndim == 4) {
+        if (new_inp[3]->dtype().enumv() != DTypeEnum::Float32 &&
+            new_inp[3]->shape().ndim == 4) {
             auto new_z =
                     RelayoutPlaceholder::make(new_inp[3], src_to_nchw4_mode);
             z_inp = new_z.node();
@@ -1889,8 +1903,10 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
                     new_param, conv_bias_opr.execution_policy(),
                     conv_bias_opr.config());
             OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
-            mgb_assert(new_conv_bias_opr.shape().ndim == 5,
-                       "The conv_bias dst dim is not trans to nchw4");
+            mgb_assert(
+                    new_conv_bias_opr.node()->dtype().enumv() == DTypeEnum::Float32 ||
+                            new_conv_bias_opr.shape().ndim == 5,
+                    "The conv_bias dst dim is not trans to nchw4");
             return new_opr;
         };
     auto replace_elemwise_opr = [=](OperatorNodeBase* opr,
diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp
index 9be80442..af6ef436 100644
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -3088,6 +3088,88 @@ TEST(TestGoptInference, ConvertFormatNCHW4GPU) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }
 
+TEST(TestGoptInference, ConvertFormatNCHW4FloatGPU) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    cn.activate();
+    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(6, 1);
+
+    HostTensorGenerator<> gen;
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+
+    auto mkvar = [&](const char* name, const TensorShape& shp,
+                     const DType& dtype) {
+        return opr::TypeCvt::make(
+                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
+                dtype);
+    };
+
+    auto mkcvar = [&](const char* name, const TensorShape& shp,
+                      const DType& dtype) {
+        return opr::TypeCvt::make(
+                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
+                        .rename(name),
+                dtype);
+    };
+
+    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(1.2f));
+    opr::ConvBias::Param param_conv_bias;
+    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
+    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
+
+    // conv1, with bias
+    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(1.3f)),
+         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::Float32());
+    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias, {},
+                                     OperatorNodeConfig{dtype::Float32()});
+
+    // conv2, with bias and z
+    auto w2 = mkcvar("w2", {8, 4, 3, 3}, dtype::QuantizedS8(1.3f)),
+         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::Float32()),
+         z2 = mkcvar("z2", {2, 8, 16, 16}, dtype::Float32());
+    auto conv2 = opr::ConvBias::make(x, w2, b2, z2, param_conv_bias, {},
+                                     OperatorNodeConfig{dtype::Float32()});
+
+    // conv3, relu
+    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
+    auto w3 = mkcvar("w3", {8, 4, 3, 3}, dtype::QuantizedS8(1.3f)),
+         b3 = mkcvar("b3", {1, 8, 1, 1}, dtype::Float32()),
+         z3 = mkcvar("z3", {2, 8, 16, 16}, dtype::Float32());
+    auto conv3 = opr::ConvBias::make(x, w3, b3, z3, param_conv_bias, {},
+                                     OperatorNodeConfig{dtype::Float32()});
+
+    auto y = conv1 + conv2 + conv3;
+
+    SymbolVar y_opt;
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_nchw4();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
+    }
+
+    bool succ = true;
+    auto cb = [&succ](cg::OperatorNodeBase* opr) {
+        if (opr->same_type<opr::ConvBias>()) {
+            auto& conv_bias = opr->cast_final_safe<opr::ConvBias>();
+            if (conv_bias.param().format !=
+                opr::ConvBias::Param::Format::NCHW4_NCHW) {
+                succ = false;
+            }
+        }
+    };
+
+    cg::DepOprIter{cb}.add(y_opt);
+    ASSERT_TRUE(succ);
+
+    HostTensorND host_y, host_y_opt;
+    auto func = graph->compile({make_callback_copy(y, host_y),
+                                make_callback_copy(y_opt, host_y_opt)});
+    func->execute();
+
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
+}
+
 #endif
 
 TEST(TestGoptInference, ConvertFormatNCHW4NonConvOpr) {
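
[Note appended for review context; not part of the patch.] The FIXME added to both cudnn algos refers to the formula the cudnnConvolutionForward-style APIs compute, dst = alpha * conv(src, filter) + beta * dst, combined with the IEEE-754 rule that 0.f * NaN is NaN: a beta of zero does not reliably erase stale NaNs already sitting in the dst buffer if that term is evaluated as a real multiplication. A minimal host-side sketch of the arithmetic the comment describes (plain C++, no cudnn; all names here are illustrative only):

    #include <cmath>
    #include <cstdio>

    int main() {
        float alpha = 1.f, beta = 0.f;
        float conv_result = 42.f;         // stands in for conv(src, filter)
        float stale_dst = std::nanf("");  // stands in for uninitialized dst memory

        // If beta * dst is actually multiplied through, the NaN survives
        // even though beta == 0, because IEEE-754 defines 0.f * NaN == NaN.
        float dst = alpha * conv_result + beta * stale_dst;
        std::printf("%f\n", dst);         // prints "nan", not "42.000000"
        return 0;
    }

The gopt side of the patch sidesteps the same case: when a QuantizedS8 conv_bias produces a Float32 output, replace_conv_bias_opr now selects Format::NCHW4_NCHW (NCHW4 input, NCHW output) instead of plain NCHW4, and accordingly skips the NCHW_TO_NCHW4 relayout for the Float32 bias and z inputs, which stay in NCHW. For reference, NCHW4 as used here packs channels in blocks of four, i.e. a [N, C/4, H, W, 4] layout; a sketch of the offset arithmetic under that assumption (C divisible by 4):

    #include <cstddef>

    // Offset of element (n, c, h, w) in a plain NCHW tensor.
    std::size_t nchw_offset(std::size_t n, std::size_t c, std::size_t h,
                            std::size_t w, std::size_t C, std::size_t H,
                            std::size_t W) {
        return ((n * C + c) * H + h) * W + w;
    }

    // Offset of the same element in NCHW4, laid out as [N, C/4, H, W, 4]:
    // the channel index splits into a block index c / 4 and an inner index
    // c % 4, with the inner index as the fastest-varying dimension.
    std::size_t nchw4_offset(std::size_t n, std::size_t c, std::size_t h,
                             std::size_t w, std::size_t C, std::size_t H,
                             std::size_t W) {
        return (((n * (C / 4) + c / 4) * H + h) * W + w) * 4 + c % 4;
    }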