GitOrigin-RevId: b29b009de0
release-1.7
@@ -19,9 +19,6 @@ using namespace cuda; | |||
using namespace conv_bias; | |||
bool ConvBiasForwardImpl::AlgoCUDNNConv::is_available(const SizeArgs& args) const { | |||
if (args.z_layout->ndim > 0) | |||
return false; | |||
if (args.filter_meta.format != Param::Format::NCHW && | |||
args.filter_meta.format != Param::Format::NHWC) { | |||
if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) { | |||
@@ -75,6 +72,15 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_bundle( | |||
sizes.push_back(dst_layout.span().dist_byte()); | |||
} | |||
if (args.z_layout->ndim > 0 && | |||
args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { | |||
auto z_layout = *args.z_layout; | |||
z_layout.dtype = DType(); | |||
args.opr->check_or_deduce_dtype_fwd( | |||
args.src_layout->dtype, args.filter_layout->dtype, z_layout.dtype); | |||
sizes.push_back(z_layout.span().dist_byte()); | |||
} | |||
SizeArgs conv_args = args; | |||
conv_args.dst_layout = &dst_layout; | |||
@@ -129,6 +135,22 @@ void ConvBiasForwardImpl::AlgoCUDNNConv::exec(const ExecArgs& args) const { | |||
cudnnGetErrorString(status), conv_args.to_string().c_str()); | |||
} | |||
if (args.z_layout->ndim > 0) { | |||
auto z_tensor = *args.z_tensor; | |||
if (args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { | |||
z_tensor.raw_ptr = bundle.get(2); | |||
z_tensor.layout.dtype = DType(); | |||
args.opr->check_or_deduce_dtype_fwd( | |||
args.src_layout->dtype, args.filter_layout->dtype, | |||
z_tensor.layout.dtype); | |||
auto typecvt = args.handle->create_operator<TypeCvt>(); | |||
typecvt->exec(*args.z_tensor, z_tensor); | |||
} | |||
auto add = args.handle->create_operator<ElemwiseForward>(); | |||
add->param().mode = Elemwise::Param::Mode::ADD; | |||
add->exec({conv_dst_tensor, z_tensor}, conv_dst_tensor); | |||
} | |||
handle_bias_and_nonlinear( | |||
args.handle, args.nonlinear_mode, &conv_dst_tensor, args.dst_tensor, | |||
args.bias_tensor); | |||
@@ -71,11 +71,12 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( | |||
return false; | |||
} | |||
#if CUDNN_VERSION < 7605 | |||
if (args.src_layout->dtype.enumv() == DTypeEnum::Float16 && | |||
args.dst_layout->dtype.enumv() == DTypeEnum::Float16 && | |||
param.format == param::ConvBias::Format::NHWC) { | |||
args.dst_layout->dtype.enumv() == DTypeEnum::Float16) { | |||
return false; | |||
} | |||
#endif | |||
#if CUDNN_MAJOR < 8 | |||
if (m_cudnn_enum == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM && | |||
@@ -1293,6 +1293,56 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_TENSORCORE_INT8) { | |||
} | |||
} | |||
TEST_F(CUDA, CONV_BIAS_ADD_Z_CUDNN_CONVOLUTION) { | |||
using namespace conv_bias; | |||
Checker<ConvBiasForward> checker(handle_cuda()); | |||
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||
ConvBiasForward::algo_name<ConvBias::DefaultParam>("CUDNN:Convolution", {}) | |||
.c_str())); | |||
NormalRNG default_rng; | |||
param::ConvBias param; | |||
param.pad_h = param.pad_w = 1; | |||
using Format = param::ConvBias::Format; | |||
using NLMode = param::ConvBias::NonlineMode; | |||
param.nonlineMode = NLMode::RELU; | |||
auto c = [&](DType dt) { | |||
param.format = Format::NCHW; | |||
/// set epsilon to be 2e-3 to bypass low accuracy of winograd algorithm | |||
float eps = 2e-3; | |||
if (dt == dtype::Float16()) { | |||
eps = 1e-2; | |||
param.compute_mode = param::ConvBias::ComputeMode::FLOAT32; | |||
} | |||
checker.set_dtype(0, dt) | |||
.set_dtype(1, dt) | |||
.set_dtype(2, dt) | |||
.set_dtype(3, dt) | |||
.set_dtype(4, dt) | |||
.set_rng(0, &default_rng) | |||
.set_rng(1, &default_rng) | |||
.set_rng(2, &default_rng) | |||
.set_rng(3, &default_rng) | |||
.set_epsilon(eps) | |||
.set_param(param) | |||
.execs({{16, 256, 7, 7}, | |||
{256, 256, 3, 3}, | |||
{1, 256, 1, 1}, | |||
{16, 256, 7, 7}, | |||
{}}); | |||
param.format = Format::NHWC; | |||
checker.set_param(param).execs( | |||
{{16, 7, 7, 256}, | |||
{256, 3, 3, 256}, | |||
{1, 1, 1, 256}, | |||
{16, 7, 7, 256}, | |||
{}}); | |||
}; | |||
c(dtype::Float32()); | |||
c(dtype::Float16()); | |||
} | |||
#if MEGDNN_WITH_BENCHMARK | |||
TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_TENSORCORE_INT8) { | |||
require_compute_capability(7, 5); | |||