fix(dnn/cuda): fix cublas matmul on sm60

GitOrigin-RevId: 3fc0c30a23
tags/v0.4.0
Megvii Engine Team (Xinran Xu), 5 years ago
commit f5833a5294
9 changed files with 20 additions and 13 deletions
  1. dnn/src/cuda/conv_bias/matmul_8x8x32.cpp   (+1, -1)
  2. dnn/src/cuda/matrix_mul/cublas.cpp          (+1, -1)
  3. dnn/test/cuda/benchmark.cpp                 (+1, -1)
  4. dnn/test/cuda/conv_bias.cpp                 (+3, -3)
  5. dnn/test/cuda/convolution.cpp               (+2, -2)
  6. dnn/test/cuda/convolution3d.cpp             (+1, -1)
  7. dnn/test/cuda/group_conv.cpp                (+1, -1)
  8. dnn/test/cuda/group_conv3d.cpp              (+1, -1)
  9. dnn/test/cuda/matrix_mul.cpp                (+9, -2)
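
All nine files make the same kind of change: ad-hoc comparisons on current_device_prop().major, which accept any sm_60 device, are replaced with is_compute_capability_required(6, 1), since the int8 matmul paths need compute capability 6.1 rather than just 6.0. As a rough sketch of what such a check boils down to (a hypothetical helper built on the CUDA runtime API, not MegEngine's actual implementation):

    #include <cuda_runtime.h>

    // Hypothetical helper, for illustration only: returns true when the
    // current device's compute capability is at least major.minor.
    static bool compute_capability_at_least(int major, int minor) {
        int device = 0;
        cudaDeviceProp prop{};
        if (cudaGetDevice(&device) != cudaSuccess ||
            cudaGetDeviceProperties(&prop, device) != cudaSuccess)
            return false;  // be conservative if the query fails
        return prop.major > major ||
               (prop.major == major && prop.minor >= minor);
    }

With a (6, 1) threshold, a GP100 card (sm_60) is rejected while GP102/GP104 cards (sm_61) pass, which is exactly the distinction the old major-only checks missed.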

dnn/src/cuda/conv_bias/matmul_8x8x32.cpp   (+1, -1)

@@ -21,7 +21,7 @@ bool ConvBiasForwardImpl::AlgoMatmul8x8x32::is_available(
         const SizeArgs& args) const {
     if (args.z_layout->ndim > 0)
         return false;
-    if (cuda::current_device_prop().major < 6)
+    if (!is_compute_capability_required(6, 1))
         return false;
 
     auto dst_layout = *args.dst_layout;
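
The stricter floor here is presumably tied to the dp4a int8 dot-product instruction, which exists only from sm_61 onward, so a plain "major >= 6" test wrongly admits sm_60 parts. A minimal, hypothetical CUDA sketch of that architecture guard (not the actual 8x8x32 kernel):

    #include <cuda_runtime.h>

    // Hypothetical kernel: each int packs four int8 lanes; __dp4a performs a
    // 4-way int8 multiply-accumulate into int32 and is only available when
    // compiling for __CUDA_ARCH__ >= 610.
    __global__ void int8_dot_kernel(const int* a, const int* b, int* out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n)
            return;
    #if __CUDA_ARCH__ >= 610
        out[i] = __dp4a(a[i], b[i], 0);
    #else
        // slow fallback: unpack the four int8 lanes by hand
        int acc = 0;
        for (int lane = 0; lane < 4; ++lane) {
            int av = static_cast<signed char>((a[i] >> (8 * lane)) & 0xff);
            int bv = static_cast<signed char>((b[i] >> (8 * lane)) & 0xff);
            acc += av * bv;
        }
        out[i] = acc;
    #endif
    }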


dnn/src/cuda/matrix_mul/cublas.cpp   (+1, -1)

@@ -42,7 +42,7 @@ bool MatrixMulForwardImpl::AlgoCuBlas::is_available(
          */
         return args.layout_a.stride[0] % 4 == 0 &&
                args.layout_b.stride[0] % 4 == 0 &&
-               current_device_prop().major > 5;
+               is_compute_capability_required(6, 1);
     }
     return false;
 }
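
This is the core of the fix: the int8 GEMM path in cuBLAS relies on dp4a and is only supported from compute capability 6.1, so the old "major > 5" condition wrongly admitted sm_60 (GP100) devices. For context, a hedged sketch of an int8 GEMM through cublasGemmEx, assuming the pre-CUDA-11 signature; exact alignment and layout restrictions vary by cuBLAS version, which is also why the stride % 4 checks above exist:

    #include <cstdint>
    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    int main() {
        // Small column-major problem; dimensions kept as multiples of 4.
        const int m = 32, n = 32, k = 32;
        std::vector<int8_t> ha(m * k, 1), hb(k * n, 1);

        int8_t *da = nullptr, *db = nullptr;
        int32_t *dc = nullptr;
        cudaMalloc(&da, ha.size());
        cudaMalloc(&db, hb.size());
        cudaMalloc(&dc, sizeof(int32_t) * m * n);
        cudaMemcpy(da, ha.data(), ha.size(), cudaMemcpyHostToDevice);
        cudaMemcpy(db, hb.data(), hb.size(), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);
        const int32_t alpha = 1, beta = 0;
        // int8 inputs, int32 accumulation: C = alpha * A * B + beta * C
        cublasStatus_t st = cublasGemmEx(
                handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
                da, CUDA_R_8I, m, db, CUDA_R_8I, k, &beta,
                dc, CUDA_R_32I, m, CUDA_R_32I, CUBLAS_GEMM_DFALT);
        // Expected to fail with a not-supported status on sm_60 devices,
        // which is what the (6, 1) availability check now guards against.
        printf("cublasGemmEx returned %d\n", static_cast<int>(st));

        cublasDestroy(handle);
        cudaFree(da);
        cudaFree(db);
        cudaFree(dc);
        return 0;
    }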


dnn/test/cuda/benchmark.cpp   (+1, -1)

@@ -24,7 +24,7 @@ namespace test {
 
 TEST_F(CUDA, BENCHMARK_CONVOLUTION_8X8X32)
 {
-    if (cuda::current_device_prop().major < 6) {
+    if (!cuda::is_compute_capability_required(6, 1)) {
         printf("Skip CUDA.BENCHMARK_CONVOLUTION_8X8X32 test as current device"
                "doesn't support\n");
         return;


dnn/test/cuda/conv_bias.cpp   (+3, -3)

@@ -325,7 +325,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_CHANWISE_SMALL) {
 }
 
 TEST_F(CUDA, CONV_BIAS_FORWARD_CHANWISE_8x8x32) {
-    require_compute_capability(6, 0);
+    require_compute_capability(6, 1);
     Checker<ConvBiasForward> checker(handle_cuda());
     checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
             ConvBiasForward::algo_name<ConvBias::DirectParam>(
@@ -472,7 +472,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL) {
 }
 
 TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_8x8x32) {
-    require_compute_capability(6, 0);
+    require_compute_capability(6, 1);
     Checker<ConvBiasForward> checker(handle_cuda());
     checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
             ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
@@ -517,7 +517,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_8x8x32) {
 }
 
 TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_NCHW4) {
-    require_compute_capability(6, 0);
+    require_compute_capability(6, 1);
     Checker<ConvBiasForward> checker(handle_cuda());
     checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
             ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(


dnn/test/cuda/convolution.cpp   (+2, -2)

@@ -30,7 +30,7 @@ namespace test {
 
 TEST_F(CUDA, CONVOLUTION_8X8X32)
 {
-    if (cuda::current_device_prop().major < 6) {
+    if (!cuda::is_compute_capability_required(6, 1)) {
         printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device"
                "doesn't support\n");
         return;
@@ -112,7 +112,7 @@ TEST_F(CUDA, CONVOLUTION_FORWARD)
 }
 
 TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
-    if (cuda::current_device_prop().major < 6)
+    if (!cuda::is_compute_capability_required(6, 1))
         return;
     using namespace convolution;
     Checker<Convolution> checker(handle_cuda());


dnn/test/cuda/convolution3d.cpp   (+1, -1)

@@ -24,7 +24,7 @@ namespace test {
 
 #if 0
 TEST_F(CUDA, CONVOLUTION3D_8X8X32) {
-    if (cuda::current_device_prop().major < 6) {
+    if (!cuda::is_compute_capability_required(6, 1)) {
         printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device"
                "doesn't support\n");
         return;


dnn/test/cuda/group_conv.cpp   (+1, -1)

@@ -23,7 +23,7 @@ namespace test {
 
 TEST_F(CUDA, GROUP_CONV_FORWARD)
 {
-    bool is_int_available = (cuda::current_device_prop().major >= 6);
+    bool is_int_available = cuda::is_compute_capability_required(6, 1);
     auto run = [&](size_t N, size_t IC, size_t IH, size_t IW,
                    size_t FH, size_t FW,
                    size_t OC, size_t /* OH */, size_t /* OW */,


dnn/test/cuda/group_conv3d.cpp   (+1, -1)

@@ -21,7 +21,7 @@ namespace megdnn {
 namespace test {
 
 TEST_F(CUDA, GROUP_CONVOLUTION3D_FORWARD) {
-    bool is_int_available = (cuda::current_device_prop().major >= 6);
+    bool is_int_available = cuda::is_compute_capability_required(6, 1);
     static_cast<void>(is_int_available);
     auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW,
                    size_t FD, size_t FH, size_t FW, size_t OC, size_t PD,


dnn/test/cuda/matrix_mul.cpp   (+9, -2)

@@ -193,8 +193,15 @@ TEST_F(CUDA, MATRIX_MUL)
     Checker<MatrixMul> checker(handle_cuda());
     using Param = MatrixMul::Param;
     size_t m = 12, n = 16, k = 20;
-    for (DType dtype: std::array<DType, 3>{
-            {dtype::Float32(), dtype::Float16(), dtype::Int32()}}) {
+
+    bool is_int_available = cuda::is_compute_capability_required(6, 1);
+    std::vector<DType> dtype_array;
+    dtype_array.push_back(dtype::Float32());
+    dtype_array.push_back(dtype::Float16());
+    if (is_int_available)
+        dtype_array.push_back(dtype::Int32());
+
+    for (DType dtype : dtype_array) {
         for (unsigned mask = 0; mask < 4; ++mask) {
             Param param;
             param.transposeA = mask & 1;
