@@ -14,6 +14,7 @@ | |||
#include "src/cuda/utils.h" | |||
#include "src/cuda/cudnn_wrapper.h" | |||
#include "src/cuda/convolution/helper.h" | |||
#include "src/cuda/conv_bias/helper.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -31,27 +32,16 @@ bool ConvolutionBackwardDataImpl::AlgoCUDNN::is_available( | |||
CUDNNBwdDataDescs D; | |||
if (!is_cudnn_supported(args.as_fwd_args())) | |||
TensorLayout bias_layout, z_layout; | |||
conv_bias::CanonizedFilterMeta meta; | |||
meta.copy_from(args.filter_meta); | |||
conv_bias::BiasForwardSizeArgs bias_args{args.handle, | |||
args.grad_layout, args.filter_layout, &bias_layout, | |||
&z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY, | |||
}; | |||
if (!conv_bias::is_cudnn_supported(bias_args)) | |||
return false; | |||
#if CUDNN_VERSION >= 7500 | |||
// As in cuda10.0 and cudnn7.5, algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with | |||
// TensorCore operations produces incorrect result. So we disable | |||
// this algo. Please remove the following code, when | |||
// nvidia has fixed this issue. | |||
// incorrect case: | |||
// inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2, | |||
// dtype=float16 | |||
if (args.filter_meta.dtype == dtype::Float16()) { | |||
const char* algo_1 = "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1"; | |||
auto cmp_len = strlen(algo_1); | |||
if (is_compute_capability_required(7, 0) && | |||
strncmp(name(), algo_1, cmp_len) == 0) { | |||
return false; | |||
} | |||
} | |||
#endif | |||
auto& cudnn = args.handle->cudnn(); | |||
args.init_desc(D); | |||
size_t workspace_size; | |||
@@ -14,6 +14,7 @@ | |||
#include "src/cuda/utils.h" | |||
#include "src/cuda/cudnn_wrapper.h" | |||
#include "src/cuda/convolution/helper.h" | |||
#include "src/cuda/conv_bias/helper.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
@@ -31,7 +32,14 @@ bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available( | |||
auto& cudnn = args.handle->cudnn(); | |||
CUDNNBwdFilterDescs D; | |||
if (!is_cudnn_supported(args.as_fwd_args())) | |||
TensorLayout bias_layout, z_layout; | |||
conv_bias::CanonizedFilterMeta meta; | |||
meta.copy_from(args.grad_filter_meta); | |||
conv_bias::BiasForwardSizeArgs bias_args{args.handle, | |||
args.src_layout, args.grad_layout, &bias_layout, | |||
&z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY, | |||
}; | |||
if (!conv_bias::is_cudnn_supported(bias_args)) | |||
return false; | |||
args.init_desc(D); | |||
@@ -33,7 +33,8 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) { | |||
args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) { | |||
return false; | |||
} | |||
} else if (args.filter_meta.format != param::Convolution::Format::NCHW) { | |||
} else if (args.filter_meta.format != param::Convolution::Format::NCHW && | |||
args.filter_meta.format != param::Convolution::Format::NHWC) { | |||
return false; | |||
} | |||
auto& fm = args.filter_meta; | |||
@@ -284,6 +284,16 @@ std::vector<TestArg> convolution::get_args_cudnn_5_1_failures() { | |||
return args; | |||
} | |||
std::vector<TestArg> convolution::get_args_cudnn_5_1_backward() { | |||
std::vector<TestArg> args; | |||
args.emplace_back( | |||
param::Convolution{param::Convolution::Mode::CROSS_CORRELATION, 2, | |||
2, 2, 2}, | |||
TensorShape{2, 8, 18, 18}, TensorShape{8, 8, 2, 2}); | |||
return args; | |||
} | |||
std::vector<TestArg> convolution::get_args_x86_winograd_algorithm() { | |||
std::vector<TestArg> args; | |||
for (size_t ic_size : {8, 16}) { | |||
@@ -40,6 +40,7 @@ std::vector<TestArg> get_args_x86_direct_case_2(); | |||
std::vector<TestArg> get_args_fallback_templated_impl(); | |||
std::vector<TestArg> get_args_fallback_non_templated_impl(); | |||
std::vector<TestArg> get_args_cudnn_5_1_failures(); | |||
std::vector<TestArg> get_args_cudnn_5_1_backward(); | |||
std::vector<TestArg> get_args_x86_winograd_algorithm(); | |||
std::vector<TestArg> get_args_BRAIN_481(); | |||
std::vector<TestArg> get_args(); | |||
@@ -238,6 +238,87 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) { | |||
} | |||
} | |||
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
    // Regression test: algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with
    // TensorCore operations produced incorrect results in cuda10.0/cudnn7.5.
    // Maybe nvidia has fixed this issue; the known-incorrect case is:
    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
    // dtype=float16
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    for (auto&& arg : args) {
        // scale RNG range by filter size so gradient magnitudes stay
        // comparable across shapes; clamp to >= 1 to avoid degenerate range
        float scale =
                128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        // the failing configuration uses NHWC layouts, so convert the
        // NCHW-specified shapes
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            // deduce the forward output layout; it serves as the diff
            // input of the backward-data operator below
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        // pass 1: float16 tensors with float32 compute mode
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        // pass 2: same shapes in plain float32 with default compute mode
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
    // Check ConvolutionBackwardData with NHWC format, exercising both the
    // float16-with-float32-compute path and the plain float32 path.
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    for (auto&& arg : args) {
        // scale RNG range by filter size so gradient magnitudes stay
        // comparable across shapes
        float scale =
                64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        // args are specified in NCHW; convert layouts to NHWC
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            // deduce the forward output layout; it serves as the diff
            // input of the backward-data operator below
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        // pass 1: float16 tensors with float32 compute mode
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        // pass 2: same shapes in plain float32 with default compute mode
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) { | |||
if (cuda::is_compute_capability_required(7, 0)) | |||
return; | |||