@@ -14,6 +14,7 @@ | |||||
#include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
#include "src/cuda/cudnn_wrapper.h" | #include "src/cuda/cudnn_wrapper.h" | ||||
#include "src/cuda/convolution/helper.h" | #include "src/cuda/convolution/helper.h" | ||||
#include "src/cuda/conv_bias/helper.h" | |||||
using namespace megdnn; | using namespace megdnn; | ||||
using namespace cuda; | using namespace cuda; | ||||
@@ -31,27 +32,16 @@ bool ConvolutionBackwardDataImpl::AlgoCUDNN::is_available( | |||||
CUDNNBwdDataDescs D; | CUDNNBwdDataDescs D; | ||||
if (!is_cudnn_supported(args.as_fwd_args())) | |||||
TensorLayout bias_layout, z_layout; | |||||
conv_bias::CanonizedFilterMeta meta; | |||||
meta.copy_from(args.filter_meta); | |||||
conv_bias::BiasForwardSizeArgs bias_args{args.handle, | |||||
args.grad_layout, args.filter_layout, &bias_layout, | |||||
&z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY, | |||||
}; | |||||
if (!conv_bias::is_cudnn_supported(bias_args)) | |||||
return false; | return false; | ||||
#if CUDNN_VERSION >= 7500 | |||||
// As in cuda10.0 and cudnn7.5, algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with | |||||
// TensorCore operations produces incorrect result. So we disable | |||||
// this algo. Please remove the following code, when | |||||
// nvidia has fixed this issue. | |||||
// incorrect case: | |||||
// inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2, | |||||
// dtype=float16 | |||||
if (args.filter_meta.dtype == dtype::Float16()) { | |||||
const char* algo_1 = "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1"; | |||||
auto cmp_len = strlen(algo_1); | |||||
if (is_compute_capability_required(7, 0) && | |||||
strncmp(name(), algo_1, cmp_len) == 0) { | |||||
return false; | |||||
} | |||||
} | |||||
#endif | |||||
auto& cudnn = args.handle->cudnn(); | auto& cudnn = args.handle->cudnn(); | ||||
args.init_desc(D); | args.init_desc(D); | ||||
size_t workspace_size; | size_t workspace_size; | ||||
@@ -14,6 +14,7 @@ | |||||
#include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
#include "src/cuda/cudnn_wrapper.h" | #include "src/cuda/cudnn_wrapper.h" | ||||
#include "src/cuda/convolution/helper.h" | #include "src/cuda/convolution/helper.h" | ||||
#include "src/cuda/conv_bias/helper.h" | |||||
using namespace megdnn; | using namespace megdnn; | ||||
using namespace cuda; | using namespace cuda; | ||||
@@ -31,7 +32,14 @@ bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available( | |||||
auto& cudnn = args.handle->cudnn(); | auto& cudnn = args.handle->cudnn(); | ||||
CUDNNBwdFilterDescs D; | CUDNNBwdFilterDescs D; | ||||
if (!is_cudnn_supported(args.as_fwd_args())) | |||||
TensorLayout bias_layout, z_layout; | |||||
conv_bias::CanonizedFilterMeta meta; | |||||
meta.copy_from(args.grad_filter_meta); | |||||
conv_bias::BiasForwardSizeArgs bias_args{args.handle, | |||||
args.src_layout, args.grad_layout, &bias_layout, | |||||
&z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY, | |||||
}; | |||||
if (!conv_bias::is_cudnn_supported(bias_args)) | |||||
return false; | return false; | ||||
args.init_desc(D); | args.init_desc(D); | ||||
@@ -33,7 +33,8 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) { | |||||
args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) { | args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) { | ||||
return false; | return false; | ||||
} | } | ||||
} else if (args.filter_meta.format != param::Convolution::Format::NCHW) { | |||||
} else if (args.filter_meta.format != param::Convolution::Format::NCHW && | |||||
args.filter_meta.format != param::Convolution::Format::NHWC) { | |||||
return false; | return false; | ||||
} | } | ||||
auto& fm = args.filter_meta; | auto& fm = args.filter_meta; | ||||
@@ -284,6 +284,16 @@ std::vector<TestArg> convolution::get_args_cudnn_5_1_failures() { | |||||
return args; | return args; | ||||
} | } | ||||
std::vector<TestArg> convolution::get_args_cudnn_5_1_backward() { | |||||
std::vector<TestArg> args; | |||||
args.emplace_back( | |||||
param::Convolution{param::Convolution::Mode::CROSS_CORRELATION, 2, | |||||
2, 2, 2}, | |||||
TensorShape{2, 8, 18, 18}, TensorShape{8, 8, 2, 2}); | |||||
return args; | |||||
} | |||||
std::vector<TestArg> convolution::get_args_x86_winograd_algorithm() { | std::vector<TestArg> convolution::get_args_x86_winograd_algorithm() { | ||||
std::vector<TestArg> args; | std::vector<TestArg> args; | ||||
for (size_t ic_size : {8, 16}) { | for (size_t ic_size : {8, 16}) { | ||||
@@ -40,6 +40,7 @@ std::vector<TestArg> get_args_x86_direct_case_2(); | |||||
std::vector<TestArg> get_args_fallback_templated_impl(); | std::vector<TestArg> get_args_fallback_templated_impl(); | ||||
std::vector<TestArg> get_args_fallback_non_templated_impl(); | std::vector<TestArg> get_args_fallback_non_templated_impl(); | ||||
std::vector<TestArg> get_args_cudnn_5_1_failures(); | std::vector<TestArg> get_args_cudnn_5_1_failures(); | ||||
std::vector<TestArg> get_args_cudnn_5_1_backward(); | |||||
std::vector<TestArg> get_args_x86_winograd_algorithm(); | std::vector<TestArg> get_args_x86_winograd_algorithm(); | ||||
std::vector<TestArg> get_args_BRAIN_481(); | std::vector<TestArg> get_args_BRAIN_481(); | ||||
std::vector<TestArg> get_args(); | std::vector<TestArg> get_args(); | ||||
@@ -238,6 +238,87 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) { | |||||
} | } | ||||
} | } | ||||
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) { | |||||
// algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with | |||||
// TensorCore operations produces incorrect result. | |||||
// Maybe nvidia has fixed this issue | |||||
// There is a test using incorrect case: | |||||
// inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2, | |||||
// dtype=float16 | |||||
using namespace convolution; | |||||
std::vector<TestArg> args = get_args_cudnn_5_1_backward(); | |||||
Checker<ConvolutionBackwardData> checker(handle_cuda()); | |||||
NormalRNG default_rng; | |||||
for (auto&& arg : args) { | |||||
float scale = | |||||
128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); | |||||
scale = std::max(scale, 1.f); | |||||
UniformFloatRNG rng(scale, 2 * scale); | |||||
arg.param.format = param::Convolution::Format::NHWC; | |||||
arg.src = cvt_src_or_dst_nchw2nhwc(arg.src); | |||||
arg.filter = cvt_filter_nchw2nhwc(arg.filter); | |||||
auto src = TensorLayout(arg.src, dtype::Float32()); | |||||
auto filter = TensorLayout(arg.filter, dtype::Float32()); | |||||
TensorLayout dst; | |||||
{ | |||||
auto opr = handle_cuda()->create_operator<Convolution>(); | |||||
opr->param() = arg.param; | |||||
opr->deduce_layout(src, filter, dst); | |||||
} | |||||
src.dtype = dst.dtype = filter.dtype = dtype::Float16(); | |||||
arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
checker.set_rng(0, &rng) | |||||
.set_rng(1, &rng) | |||||
.set_epsilon(1e-2) | |||||
.set_param(arg.param) | |||||
.exec(TensorLayoutArray{filter, dst, src}); | |||||
src.dtype = dst.dtype = filter.dtype = dtype::Float32(); | |||||
arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
checker.set_rng(0, &rng) | |||||
.set_rng(1, &rng) | |||||
.set_epsilon(1e-2) | |||||
.set_param(arg.param) | |||||
.exec(TensorLayoutArray{filter, dst, src}); | |||||
} | |||||
} | |||||
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) { | |||||
using namespace convolution; | |||||
std::vector<TestArg> args = get_args_cuda_conv_bwd_data(); | |||||
Checker<ConvolutionBackwardData> checker(handle_cuda()); | |||||
NormalRNG default_rng; | |||||
for (auto&& arg : args) { | |||||
float scale = | |||||
64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); | |||||
UniformFloatRNG rng(scale, 2 * scale); | |||||
arg.param.format = param::Convolution::Format::NHWC; | |||||
arg.src = cvt_src_or_dst_nchw2nhwc(arg.src); | |||||
arg.filter = cvt_filter_nchw2nhwc(arg.filter); | |||||
auto src = TensorLayout(arg.src, dtype::Float32()); | |||||
auto filter = TensorLayout(arg.filter, dtype::Float32()); | |||||
TensorLayout dst; | |||||
{ | |||||
auto opr = handle_cuda()->create_operator<Convolution>(); | |||||
opr->param() = arg.param; | |||||
opr->deduce_layout(src, filter, dst); | |||||
} | |||||
src.dtype = dst.dtype = filter.dtype = dtype::Float16(); | |||||
arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
checker.set_rng(0, &rng) | |||||
.set_rng(1, &rng) | |||||
.set_epsilon(1e-2) | |||||
.set_param(arg.param) | |||||
.exec(TensorLayoutArray{filter, dst, src}); | |||||
src.dtype = dst.dtype = filter.dtype = dtype::Float32(); | |||||
arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
checker.set_rng(0, &rng) | |||||
.set_rng(1, &rng) | |||||
.set_epsilon(1e-2) | |||||
.set_param(arg.param) | |||||
.exec(TensorLayoutArray{filter, dst, src}); | |||||
} | |||||
} | |||||
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) { | TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) { | ||||
if (cuda::is_compute_capability_required(7, 0)) | if (cuda::is_compute_capability_required(7, 0)) | ||||
return; | return; | ||||