diff --git a/dnn/src/cuda/utils.cuh b/dnn/src/cuda/utils.cuh index 3c6caa06..c59a8f4a 100644 --- a/dnn/src/cuda/utils.cuh +++ b/dnn/src/cuda/utils.cuh @@ -83,6 +83,12 @@ cuda_check(cudaGetLastError()); \ } while (0) +#if MEGDNN_TEGRA_X2 +//! TX2 only has 256 CUDA cores +#define NR_THREADS 256 +#define NR_THREADS_X 32 +#define NR_THREADS_Y 8 +#else #if MEGDNN_THREADS_512 #define NR_THREADS 512 #define NR_THREADS_X 32 @@ -92,6 +98,7 @@ #define NR_THREADS_X 32 #define NR_THREADS_Y 32 #endif +#endif #define DIVUP(x, y) (((x) + (y)-1) / (y)) #define ROUNDUP(x, y) (DIVUP(x, y) * (y)) diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp index 4429e4b0..48a40986 100644 --- a/dnn/test/cuda/convolution.cpp +++ b/dnn/test/cuda/convolution.cpp @@ -22,6 +22,8 @@ #include "test/cuda/fixture.h" #include "test/cuda/utils.h" +#include + #define V1(x) #x #define V(x) V1(x) #define CUDNN_VERSION_STRING \ @@ -161,23 +163,6 @@ TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) { } } -TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) { - using namespace convolution; - std::vector args = get_1x1_args(); - Benchmarker marker(handle_cuda()); - NormalRNG default_rng; - for (auto&& arg : args) { - float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); - UniformFloatRNG rng(scale, 2 * scale); - marker.set_dtype(0, dtype::Float32()) - .set_dtype(1, dtype::Float32()) - .set_rng(0, &default_rng) - .set_rng(1, &default_rng) - .set_param(arg.param) - .execs({arg.src, arg.filter, {}}); - } -} - TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) { using namespace convolution; std::vector args = get_args_cuda_conv_bwd_data(); @@ -767,6 +752,23 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) { } #if MEGDNN_WITH_BENCHMARK +TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) { + using namespace convolution; + std::vector args = get_1x1_args(); + Benchmarker marker(handle_cuda()); + NormalRNG default_rng; + for (auto&& arg : args) { + float scale = 1.0f / sqrt(arg.filter[1] * 
arg.filter[2] * arg.filter[3]); + UniformFloatRNG rng(scale, 2 * scale); + marker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); + } +} + TEST_F(CUDA, CONV_FWD_BENCHMARK) { auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0, diff --git a/dnn/test/cuda/flip.cpp b/dnn/test/cuda/flip.cpp index 118cd274..fffb947e 100644 --- a/dnn/test/cuda/flip.cpp +++ b/dnn/test/cuda/flip.cpp @@ -44,6 +44,7 @@ TEST_F(CUDA, FLIP) { } } +#if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, FLIP_BENCHMARK) { auto run = [&](const TensorShapeArray& shapes) { Benchmarker benchmarker(handle_cuda()); @@ -75,6 +76,7 @@ TEST_F(CUDA, FLIP_BENCHMARK) { run(shapes); } +#endif } // namespace test } // namespace megdnn diff --git a/dnn/test/cuda/images2neibs.cpp b/dnn/test/cuda/images2neibs.cpp index 2830f9c2..b67b0f8a 100644 --- a/dnn/test/cuda/images2neibs.cpp +++ b/dnn/test/cuda/images2neibs.cpp @@ -14,6 +14,7 @@ #include "test/common/images2neibs.h" #include "test/common/rng.h" #include "test/cuda/benchmark.h" +#include "test/cuda/utils.h" namespace megdnn { namespace test { @@ -44,6 +45,7 @@ TEST_F(CUDA, BENCHMARK_IMAGES2NEIBS_FORWARD) { #endif TEST_F(CUDA, IMAGES2NEIBS_BACKWARD) { + require_compute_capability(6, 1); UniformFloatRNG rng(0, 1); auto args = images2neibs::get_args(); for (auto&& arg : args) { diff --git a/dnn/test/cuda/indexing_one_hot.cpp b/dnn/test/cuda/indexing_one_hot.cpp index aa888dc4..779819b4 100644 --- a/dnn/test/cuda/indexing_one_hot.cpp +++ b/dnn/test/cuda/indexing_one_hot.cpp @@ -39,6 +39,11 @@ TEST_F(CUDA_ERROR_INFO, INDEXING_ONE_HOT) { ASSERT_TRUE(failed); } +TEST_F(CUDA, INDEXING_SET_ONE_HOT) { + run_indexing_set_one_hot_test(handle_cuda()); +} + +#if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) { Benchmarker bench{handle_cuda()}; 
bench.set_times(1); @@ -53,9 +58,6 @@ TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) { printf("bandwidth: %.2fGiB/s\n", A * B * D * sizeof(float) / 1024.0 / 1024 / 1024 / time); } - -TEST_F(CUDA, INDEXING_SET_ONE_HOT) { - run_indexing_set_one_hot_test(handle_cuda()); -} +#endif // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/test/cuda/matrix_mul.cpp b/dnn/test/cuda/matrix_mul.cpp index 1d48a35a..0a1c44c7 100644 --- a/dnn/test/cuda/matrix_mul.cpp +++ b/dnn/test/cuda/matrix_mul.cpp @@ -14,13 +14,12 @@ #include "test/common/benchmarker.h" #include "test/common/checker.h" #include "test/common/matrix_mul.h" +#include "test/cuda/utils.h" #if defined(cuda_check) #undef cuda_check #endif -#include "test/cuda/utils.h" - -#include +#include "src/cuda/utils.h" namespace megdnn { namespace test { @@ -47,13 +46,7 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) { } TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) { - if (cuda::current_device_prop().major < 7 || - (cuda::current_device_prop().major == 7 && - cuda::current_device_prop().minor < 5)) { - printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device " - "doesn't support\n"); - return; - } + require_compute_capability(7, 5); Checker checker(handle_cuda(), false); using Param = MatrixMul::Param; Param param; @@ -65,21 +58,15 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) { checker.exec({{256, 256}, {256, 256}, {256, 256}}); auto args = matrix_mul::get_matmul_args(); for (auto arg : args) { - size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8, - k = DIVUP(arg.k, 32) * 32; + size_t m = (arg.m + 7) / 8 * 8, n = (arg.n + 7) / 8 * 8, + k = (arg.k + 31) / 32 * 32; checker.exec({{m, k}, {n, k}, {m, n}}); } } #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) { - if (cuda::current_device_prop().major < 7 || - (cuda::current_device_prop().major == 7 && - cuda::current_device_prop().minor < 5)) { - printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current 
" - "device doesn't support\n"); - return; - } + require_compute_capability(7, 5); Benchmarker bencher(handle_cuda()); using Param = MatrixMul::Param; Param param; @@ -102,14 +89,7 @@ TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) { } TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) { - if (cuda::current_device_prop().major < 7 || - (cuda::current_device_prop().major == 7 && - cuda::current_device_prop().minor < 5)) { - printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as " - "current " - "device doesn't support\n"); - return; - } + require_compute_capability(7, 5); Benchmarker bencher(handle_cuda()); using Param = MatrixMul::Param; Param param; diff --git a/dnn/test/cuda/padding.cpp b/dnn/test/cuda/padding.cpp index 1ae5ea22..7969706d 100644 --- a/dnn/test/cuda/padding.cpp +++ b/dnn/test/cuda/padding.cpp @@ -188,8 +188,7 @@ TEST_F(CUDA, PADDING_REPLICATE2) { 6, 7, 7, 8, 9, 9, 9, 9})}); } -// #if MEGDNN_WITH_BENCHMARK - +#if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) { using Param = Padding::Param; @@ -240,5 +239,4 @@ TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) { run(shapes, param); } } - -// #endif \ No newline at end of file +#endif diff --git a/dnn/test/cuda/rotate.cpp b/dnn/test/cuda/rotate.cpp index def9d424..142caa15 100644 --- a/dnn/test/cuda/rotate.cpp +++ b/dnn/test/cuda/rotate.cpp @@ -40,6 +40,7 @@ TEST_F(CUDA, ROTATE) { } } +#if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_ROTATE) { auto run = [&](const TensorShapeArray& shapes) { Benchmarker benchmarker(handle_cuda()); @@ -74,6 +75,7 @@ TEST_F(CUDA, BENCHMARK_ROTATE) { run(shapes); } +#endif } // namespace rotate } // namespace test diff --git a/dnn/test/cuda/sliding_window_transpose.cpp b/dnn/test/cuda/sliding_window_transpose.cpp index 9621dd14..74f78bd2 100644 --- a/dnn/test/cuda/sliding_window_transpose.cpp +++ b/dnn/test/cuda/sliding_window_transpose.cpp @@ -42,18 +42,6 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_FORWARD) { } } -#if MEGDNN_WITH_BENCHMARK 
-TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) { - auto args = sliding_window_transpose::get_benchmark_args(); - for (auto&& arg : args) { - CUBenchmarker bencher(handle_cuda()); - bencher.set_param(arg.param) - .set_dtype(0, dtype::Float32()) - .exec(TensorShapeArray{arg.ishape, {}}); - } -} -#endif - TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) { UniformFloatRNG rng(0, 1); auto args = sliding_window_transpose::get_args(); @@ -78,6 +66,18 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) { } } +#if MEGDNN_WITH_BENCHMARK +TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) { + auto args = sliding_window_transpose::get_benchmark_args(); + for (auto&& arg : args) { + CUBenchmarker bencher(handle_cuda()); + bencher.set_param(arg.param) + .set_dtype(0, dtype::Float32()) + .exec(TensorShapeArray{arg.ishape, {}}); + } +} +#endif + } // namespace test } // namespace megdnn diff --git a/dnn/test/cuda/type_cvt.cpp b/dnn/test/cuda/type_cvt.cpp index 6cd7e2a9..14feae90 100644 --- a/dnn/test/cuda/type_cvt.cpp +++ b/dnn/test/cuda/type_cvt.cpp @@ -33,25 +33,6 @@ TEST_F(CUDA, TYPE_CVT) { } } -TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) { - const size_t RUNS = 3; - - auto run = [&](TensorLayout src, TensorLayout dst) { - Benchmarker benchmarker(handle_cuda()); - auto&& layout = src; - benchmarker.set_times(RUNS); - dst.init_contiguous_stride(); - auto used = benchmarker.execl({src, dst}); - printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(), - 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 / - (1024 * 1024 * 1024)); - }; - - TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()), - dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32()); - run(src, dst); -} - TEST_F(CUDA, QUANTIZED_TYPECVT) { UniformIntRNG int_rng{-66, 66}; Checker checker(handle_cuda()); @@ -162,6 +143,25 @@ TEST_F(CUDA, TYPE_CVT_BFLOAT16) { } #if MEGDNN_WITH_BENCHMARK +TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) { + const size_t RUNS = 
3; + + auto run = [&](TensorLayout src, TensorLayout dst) { + Benchmarker benchmarker(handle_cuda()); + auto&& layout = src; + benchmarker.set_times(RUNS); + dst.init_contiguous_stride(); + auto used = benchmarker.execl({src, dst}); + printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(), + 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 / + (1024 * 1024 * 1024)); + }; + + TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()), + dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32()); + run(src, dst); +} + TEST_F(CUDA, BENCHMARK_TYPE_CVT) { UniformIntRNG rng{-128, 127}; auto run = [&](TensorLayout src, TensorLayout dst) {