From 633016a962573ae9578b41f823da5aa1c5724c08 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 10 Jun 2021 18:30:41 +0800 Subject: [PATCH] fix(dnn/cuda): fix AlgoFallbackNCHWQS8 to support Float32 dst GitOrigin-RevId: 06f90f5cf384bc4ddb2f97860e4f530ee9a85705 --- dnn/src/cuda/conv_bias/conv_nchwqs8.cpp | 82 +++++++++++++-------- .../cuda/conv_bias/cudnn_conv_bias_activation.cpp | 3 +- dnn/test/cuda/conv_bias_int8.cpp | 85 ++++++++++++++++++++++ 3 files changed, 139 insertions(+), 31 deletions(-) diff --git a/dnn/src/cuda/conv_bias/conv_nchwqs8.cpp b/dnn/src/cuda/conv_bias/conv_nchwqs8.cpp index 2520cc70..04a6697a 100644 --- a/dnn/src/cuda/conv_bias/conv_nchwqs8.cpp +++ b/dnn/src/cuda/conv_bias/conv_nchwqs8.cpp @@ -50,15 +50,23 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::make_inner_layout( deduce_reformat_layout(relayout_src, *args.filter_layout, inner_weight_layout, RelayoutFormat::Param::Mode::NCHW_NCHW4_WEIGHT); - deduce_reformat_layout(relayout_src, *args.dst_layout, inner_dst_layout, - RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, - args.filter_meta.group); - deduce_reformat_layout(relayout_src, *args.bias_layout, inner_bias_layout, - RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, - args.filter_meta.group); - deduce_reformat_layout(relayout_src, *args.z_layout, inner_z_layout, - RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, - args.filter_meta.group); + bool dst_float = args.dst_layout->dtype.enumv() == DTypeEnum::Float32; + if (dst_float) { + inner_dst_layout = *args.dst_layout; + inner_bias_layout = *args.bias_layout; + inner_z_layout = *args.z_layout; + } else { + deduce_reformat_layout(relayout_src, *args.dst_layout, inner_dst_layout, + RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, + args.filter_meta.group); + deduce_reformat_layout(relayout_src, *args.bias_layout, + inner_bias_layout, + RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, + args.filter_meta.group); + deduce_reformat_layout(relayout_src, *args.z_layout, inner_z_layout, + RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, + args.filter_meta.group); + } }; bool ConvBiasForwardImpl::AlgoFallbackNCHWQS8::is_available( @@ -70,8 +78,7 @@ bool ConvBiasForwardImpl::AlgoFallbackNCHWQS8::is_available( auto&& param = args.opr->param(); bool is_format_ok = param.format == param::ConvBias::Format::NCHW; bool is_version_ok = CUDNN_VERSION >= 7500; - bool is_dtype_ok = - args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8; + bool is_dtype_ok = args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8; bool is_bias_ok = args.bias_layout->ndim == 0 || (args.bias_layout->ndim == 4 && args.bias_layout->shape[0] == 1 && @@ -90,17 +97,23 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS8::get_workspace_bundle( TensorLayout inner_z_layout; make_inner_layout(args, inner_src_layout, inner_weight_layout, inner_dst_layout, inner_bias_layout, inner_z_layout); - auto opr = args.handle->create_operator(); Param inner_conv_param = args.opr->param(); - inner_conv_param.format = Param::Format::NCHW4; + size_t ws_dst = 0, ws_bias = 0, ws_z = 0; + if (args.dst_layout->dtype.enumv() == DTypeEnum::Float32) { + inner_conv_param.format = Param::Format::NCHW4_NCHW; + } else { + inner_conv_param.format = Param::Format::NCHW4; + ws_dst = inner_dst_layout.span().dist_byte(); + ws_bias = inner_bias_layout.span().dist_byte(); + ws_z = inner_z_layout.span().dist_byte(); + } + auto opr = args.handle->create_operator(); opr->param() = inner_conv_param; - return WorkspaceBundle(ptr, {inner_src_layout.span().dist_byte(), - inner_weight_layout.span().dist_byte(), - inner_dst_layout.span().dist_byte(), - inner_bias_layout.span().dist_byte(), - inner_z_layout.span().dist_byte(), - opr->get_workspace_in_bytes( - inner_src_layout, inner_weight_layout, + return WorkspaceBundle( + ptr, + {inner_src_layout.span().dist_byte(), + inner_weight_layout.span().dist_byte(), ws_dst, ws_bias, ws_z, + opr->get_workspace_in_bytes(inner_src_layout, inner_weight_layout, inner_bias_layout, inner_z_layout, inner_dst_layout, nullptr)}); } @@ -145,22 +158,33 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::exec( TensorND inner_bias(bundle.get(3), inner_bias_layout); TensorND inner_z(bundle.get(4), inner_z_layout); + bool dst_float = args.dst_layout->dtype.enumv() == DTypeEnum::Float32; + Param inner_conv_param = args.opr->param(); - inner_conv_param.format = Param::Format::NCHW4; + inner_conv_param.format = + dst_float ? Param::Format::NCHW4_NCHW : Param::Format::NCHW4; auto inner_opr = args.handle->create_operator(); inner_opr->param() = inner_conv_param; relayout_nchw_nchw4->exec(*args.src_tensor, inner_src, {}); relayout_weight->exec(*args.filter_tensor, inner_weight, {}); - if (inner_bias_layout.ndim > 0) { - relayout_nchw_nchw4->exec(*args.bias_tensor, inner_bias, {}); - } - if (inner_z_layout.ndim > 0) { - relayout_nchw_nchw4->exec(*args.z_tensor, inner_z, {}); + + if (dst_float) { + inner_opr->exec(inner_src, inner_weight, *args.bias_tensor, + *args.z_tensor, *args.dst_tensor, nullptr, + Workspace((dt_byte*)bundle.get(5), bundle.get_size(5))); + } else { + if (inner_bias_layout.ndim > 0) { + relayout_nchw_nchw4->exec(*args.bias_tensor, inner_bias, {}); + } + if (inner_z_layout.ndim > 0) { + relayout_nchw_nchw4->exec(*args.z_tensor, inner_z, {}); + } + inner_opr->exec(inner_src, inner_weight, inner_bias, inner_z, inner_dst, + nullptr, + Workspace((dt_byte*)bundle.get(5), bundle.get_size(5))); + relayout_nchw4_nchw->exec(inner_dst, *args.dst_tensor, {}); } - inner_opr->exec(inner_src, inner_weight, inner_bias, inner_z, inner_dst, - nullptr, Workspace((dt_byte*)bundle.get(5), bundle.get_size(5))); - relayout_nchw4_nchw->exec(inner_dst, *args.dst_tensor, {}); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp index 9839dcbe..fce69d5a 100644 --- a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp +++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp @@ -192,8 +192,7 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec( dst_dtype = args.dst_layout->dtype; megdnn_assert( (src_dtype.category() == dst_dtype.category()) || - (args.opr->param().format == param::ConvBias::Format::NCHW4_NCHW && - src_dtype.enumv() == DTypeEnum::QuantizedS8 && + (src_dtype.enumv() == DTypeEnum::QuantizedS8 && dst_dtype.enumv() == DTypeEnum::Float32)); megdnn_assert(src_dtype.category() == filter_dtype.category()); diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index e13a562c..707545f8 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -28,6 +28,15 @@ namespace megdnn { namespace test { namespace conv{ +TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CUDNN_CONVOLUTION) { + require_compute_capability(7, 5); + conv_bias::check_conv_bias( + dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f}, + dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, + handle_cuda(), "DEFAULT:CUDNN:ConvBiasActivation:", + param::ConvBias::Format::NCHW4); +} + TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) { require_compute_capability(6, 1); conv_bias::check_conv_bias( @@ -689,6 +698,82 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) { } +TEST_F(CUDA, FALLBACK_CONV_QS8) { + require_compute_capability_eq(7, 5); + Checker checker(handle_cuda()); + auto check = [&checker](const std::string&& algo) { + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo.c_str())); + UniformIntRNG rng{-3, 3}; + UniformIntRNG bias_rng{-50, 50}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &bias_rng) + .set_rng(3, &rng) + .set_dtype(0, dtype::QuantizedS8{1.2f}) + .set_dtype(1, dtype::QuantizedS8{1.3f}) + .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f}) + .set_dtype(3, dtype::QuantizedS8{19.990229f}) + .set_dtype(4, dtype::QuantizedS8{19.990228f}) + .set_epsilon(1e-3) + .set_max_avg_error(1e-1) + .set_max_avg_biased_error(1e-3); + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.stride_h = param.stride_w = 2; + param.format = param::ConvBias::Format::NCHW; + checker.set_param(param).execs({{16, 15, 14, 14}, + {28, 15, 3, 3}, + {1, 28, 1, 1}, + {16, 28, 7, 7}, + {}}); + checker.set_param(param).execs({{16, 32, 14, 14}, + {32, 32, 3, 3}, + {1, 32, 1, 1}, + {}, + {}}); + }; + check("FALLBACK_CONV_NCHW_QS8"); +} + +TEST_F(CUDA, FALLBACK_CONV_QS8_F32) { + require_compute_capability_eq(7, 5); + Checker checker(handle_cuda()); + auto check = [&checker](const std::string&& algo) { + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo.c_str())); + UniformIntRNG rng{-3, 3}; + UniformFloatRNG bias_rng{-50.f, 50.f}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &bias_rng) + .set_rng(3, &rng) + .set_dtype(0, dtype::QuantizedS8{1.2f}) + .set_dtype(1, dtype::QuantizedS8{1.3f}) + .set_dtype(2, dtype::Float32{}) + .set_dtype(3, dtype::Float32{}) + .set_dtype(4, dtype::Float32{}) + .set_epsilon(1e-3) + .set_max_avg_error(1e-1) + .set_max_avg_biased_error(1e-3); + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.stride_h = param.stride_w = 2; + param.format = param::ConvBias::Format::NCHW; + checker.set_param(param).execs({{16, 15, 14, 14}, + {28, 15, 3, 3}, + {1, 28, 1, 1}, + {16, 28, 7, 7}, + {}}); + checker.set_param(param).execs({{16, 32, 14, 14}, + {32, 32, 3, 3}, + {1, 32, 1, 1}, + {}, + {}}); + }; + check("FALLBACK_CONV_NCHW_QS8"); +} + TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_WEIGHT_PREPROCESS) { require_compute_capability(6, 1); Checker> checker(