@@ -83,6 +83,12 @@
         cuda_check(cudaGetLastError()); \
     } while (0)
 
+#if MEGDNN_TEGRA_X2
+//! TX2 only has 256 CUDA cores
+#define NR_THREADS 256
+#define NR_THREADS_X 32
+#define NR_THREADS_Y 8
+#else
 #if MEGDNN_THREADS_512
 #define NR_THREADS 512
 #define NR_THREADS_X 32
@@ -92,6 +98,7 @@
 #define NR_THREADS_X 32
 #define NR_THREADS_Y 32
 #endif
+#endif
 
 #define DIVUP(x, y) (((x) + (y)-1) / (y))
 #define ROUNDUP(x, y) (DIVUP(x, y) * (y))
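Note: `DIVUP` is integer ceiling division and `ROUNDUP(x, y)` rounds `x` up to the next multiple of `y`; together with `NR_THREADS` they size kernel launches so every element is covered. A minimal usage sketch follows — the kernel and launcher are hypothetical, not part of this patch:

```cpp
// Hypothetical 1-D launch over n elements, using the macros above.
__global__ void fill_ones(float* dst, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)  // the grid is rounded up, so the tail block must bounds-check
        dst[i] = 1.0f;
}

void launch_fill_ones(float* dst, int n) {
    // DIVUP(n, NR_THREADS) blocks of NR_THREADS threads cover all n elements;
    // per the #if above, NR_THREADS is 256 on TX2 and larger elsewhere.
    fill_ones<<<DIVUP(n, NR_THREADS), NR_THREADS>>>(dst, n);
}
```

For example, `DIVUP(1000, 256) == 4` and `ROUNDUP(1000, 256) == 1024`.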
@@ -22,6 +22,8 @@
 #include "test/cuda/fixture.h"
 #include "test/cuda/utils.h"
 
+#include <cudnn.h>
+
 #define V1(x) #x
 #define V(x) V1(x)
 #define CUDNN_VERSION_STRING \
@@ -161,23 +163,6 @@ TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
     }
 }
 
-TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
-    using namespace convolution;
-    std::vector<TestArg> args = get_1x1_args();
-    Benchmarker<ConvolutionForward> marker(handle_cuda());
-    NormalRNG default_rng;
-    for (auto&& arg : args) {
-        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
-        UniformFloatRNG rng(scale, 2 * scale);
-        marker.set_dtype(0, dtype::Float32())
-                .set_dtype(1, dtype::Float32())
-                .set_rng(0, &default_rng)
-                .set_rng(1, &default_rng)
-                .set_param(arg.param)
-                .execs({arg.src, arg.filter, {}});
-    }
-}
-
 TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
     using namespace convolution;
     std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
@@ -767,6 +752,23 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
 }
 
 #if MEGDNN_WITH_BENCHMARK
+TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
+    using namespace convolution;
+    std::vector<TestArg> args = get_1x1_args();
+    Benchmarker<ConvolutionForward> marker(handle_cuda());
+    NormalRNG default_rng;
+    for (auto&& arg : args) {
+        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
+        UniformFloatRNG rng(scale, 2 * scale);
+        marker.set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_rng(0, &default_rng)
+                .set_rng(1, &default_rng)
+                .set_param(arg.param)
+                .execs({arg.src, arg.filter, {}});
+    }
+}
+
 TEST_F(CUDA, CONV_FWD_BENCHMARK) {
     auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
                    size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
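All of the benchmark-test moves in this patch (here, and in the flip, rotate, sliding-window-transpose, indexing and type-cvt files below) follow the same pattern: a timing-only test is relocated inside the existing `MEGDNN_WITH_BENCHMARK` guard, so regular test builds neither compile nor run it. Schematically, with a placeholder test name:

```cpp
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_SOMETHING) {  // placeholder name, not a real test in this patch
    // ... timing-only body, excluded from ordinary test builds ...
}
#endif
```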
@@ -44,6 +44,7 @@ TEST_F(CUDA, FLIP) {
     }
 }
 
+#if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, FLIP_BENCHMARK) {
     auto run = [&](const TensorShapeArray& shapes) {
         Benchmarker<Flip> benchmarker(handle_cuda());
@@ -75,6 +76,7 @@ TEST_F(CUDA, FLIP_BENCHMARK) {
     run(shapes);
 }
+#endif
 
 } // namespace test
 } // namespace megdnn
@@ -14,6 +14,7 @@
 #include "test/common/images2neibs.h"
 #include "test/common/rng.h"
 #include "test/cuda/benchmark.h"
+#include "test/cuda/utils.h"
 
 namespace megdnn {
 namespace test {
@@ -44,6 +45,7 @@ TEST_F(CUDA, BENCHMARK_IMAGES2NEIBS_FORWARD) {
 #endif
 
 TEST_F(CUDA, IMAGES2NEIBS_BACKWARD) {
+    require_compute_capability(6, 1);
     UniformFloatRNG rng(0, 1);
     auto args = images2neibs::get_args();
     for (auto&& arg : args) {
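`require_compute_capability(major, minor)` comes from the newly included `test/cuda/utils.h`; its definition is not part of this diff. Below is a hedged sketch of what such a helper plausibly does, modeled on the open-coded checks this patch deletes from matrix_mul.cpp further down — the early-`return` behavior and the exact message are assumptions:

```cpp
// Hedged sketch only -- the real helper lives in test/cuda/utils.h.
#include <cstdio>
#include <cuda_runtime.h>

#define require_compute_capability(major_, minor_)                      \
    do {                                                                 \
        int dev_ = 0;                                                    \
        cudaGetDevice(&dev_);                                            \
        cudaDeviceProp prop_;                                            \
        cudaGetDeviceProperties(&prop_, dev_);                           \
        if (prop_.major < (major_) ||                                    \
            (prop_.major == (major_) && prop_.minor < (minor_))) {       \
            printf("skip: device below compute capability %d.%d\n",      \
                   (major_), (minor_));                                  \
            /* a macro (not a function) so this returns from the test */ \
            return;                                                      \
        }                                                                \
    } while (0)
```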
@@ -39,6 +39,11 @@ TEST_F(CUDA_ERROR_INFO, INDEXING_ONE_HOT) {
     ASSERT_TRUE(failed);
 }
 
+TEST_F(CUDA, INDEXING_SET_ONE_HOT) {
+    run_indexing_set_one_hot_test(handle_cuda());
+}
+
+#if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) {
     Benchmarker<IndexingOneHot> bench{handle_cuda()};
     bench.set_times(1);
@@ -53,9 +58,6 @@ TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) {
     printf("bandwidth: %.2fGiB/s\n",
            A * B * D * sizeof(float) / 1024.0 / 1024 / 1024 / time);
 }
-
-TEST_F(CUDA, INDEXING_SET_ONE_HOT) {
-    run_indexing_set_one_hot_test(handle_cuda());
-}
+#endif
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -14,13 +14,12 @@
 #include "test/common/benchmarker.h"
 #include "test/common/checker.h"
 #include "test/common/matrix_mul.h"
+#include "test/cuda/utils.h"
 
 #if defined(cuda_check)
 #undef cuda_check
 #endif
-#include "test/cuda/utils.h"
-#include <cuda.h>
-#include "src/cuda/utils.h"
 
 namespace megdnn {
 namespace test {
@@ -47,13 +46,7 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
 }
 
 TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
-    if (cuda::current_device_prop().major < 7 ||
-        (cuda::current_device_prop().major == 7 &&
-         cuda::current_device_prop().minor < 5)) {
-        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
-               "doesn't support\n");
-        return;
-    }
+    require_compute_capability(7, 5);
     Checker<MatrixMul> checker(handle_cuda(), false);
     using Param = MatrixMul::Param;
     Param param;
@@ -65,21 +58,15 @@
     checker.exec({{256, 256}, {256, 256}, {256, 256}});
     auto args = matrix_mul::get_matmul_args();
     for (auto arg : args) {
-        size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8,
-               k = DIVUP(arg.k, 32) * 32;
+        size_t m = (arg.m + 7) / 8 * 8, n = (arg.n + 7) / 8 * 8,
+               k = (arg.k + 31) / 32 * 32;
         checker.exec({{m, k}, {n, k}, {m, n}});
     }
 }
 
 #if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
-    if (cuda::current_device_prop().major < 7 ||
-        (cuda::current_device_prop().major == 7 &&
-         cuda::current_device_prop().minor < 5)) {
-        printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
-               "device doesn't support\n");
-        return;
-    }
+    require_compute_capability(7, 5);
     Benchmarker<MatrixMul> bencher(handle_cuda());
     using Param = MatrixMul::Param;
     Param param;
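The `DIVUP`-to-arithmetic rewrite above is behavior-preserving: with the `src/cuda/utils.h` include dropped earlier in this file, the `DIVUP` macro is presumably no longer in scope here, and `(x + a - 1) / a * a` spells out the same round-up-to-a-multiple computation. A quick check of the equivalence:

```cpp
// DIVUP(x, a) * a == (x + a - 1) / a * a for positive integers, e.g. x = 100:
static_assert((100 + 7) / 8 * 8 == 104, "100 rounded up to a multiple of 8");
static_assert((100 + 31) / 32 * 32 == 128, "100 rounded up to a multiple of 32");
```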
@@ -102,14 +89,7 @@
 }
 
 TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
-    if (cuda::current_device_prop().major < 7 ||
-        (cuda::current_device_prop().major == 7 &&
-         cuda::current_device_prop().minor < 5)) {
-        printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
-               "current "
-               "device doesn't support\n");
-        return;
-    }
+    require_compute_capability(7, 5);
     Benchmarker<MatrixMul> bencher(handle_cuda());
     using Param = MatrixMul::Param;
     Param param;
@@ -188,8 +188,7 @@ TEST_F(CUDA, PADDING_REPLICATE2) {
                     6, 7, 7, 8, 9, 9, 9, 9})});
 }
 
-// #if MEGDNN_WITH_BENCHMARK
+#if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) {
     using Param = Padding::Param;
@@ -240,5 +239,4 @@ TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) {
         run(shapes, param);
     }
 }
-// #endif
+#endif
@@ -40,6 +40,7 @@ TEST_F(CUDA, ROTATE) {
     }
 }
 
+#if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_ROTATE) {
     auto run = [&](const TensorShapeArray& shapes) {
         Benchmarker<Rotate> benchmarker(handle_cuda());
@@ -74,6 +75,7 @@ TEST_F(CUDA, BENCHMARK_ROTATE) {
     run(shapes);
 }
+#endif
 
 } // namespace rotate
 } // namespace test
@@ -42,18 +42,6 @@
     }
 }
 
-#if MEGDNN_WITH_BENCHMARK
-TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) {
-    auto args = sliding_window_transpose::get_benchmark_args();
-    for (auto&& arg : args) {
-        CUBenchmarker<SlidingWindowTransposeForward> bencher(handle_cuda());
-        bencher.set_param(arg.param)
-                .set_dtype(0, dtype::Float32())
-                .exec(TensorShapeArray{arg.ishape, {}});
-    }
-}
-#endif
-
 TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) {
     UniformFloatRNG rng(0, 1);
     auto args = sliding_window_transpose::get_args();
@@ -78,6 +66,18 @@
     }
 }
 
+#if MEGDNN_WITH_BENCHMARK
+TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) {
+    auto args = sliding_window_transpose::get_benchmark_args();
+    for (auto&& arg : args) {
+        CUBenchmarker<SlidingWindowTransposeForward> bencher(handle_cuda());
+        bencher.set_param(arg.param)
+                .set_dtype(0, dtype::Float32())
+                .exec(TensorShapeArray{arg.ishape, {}});
+    }
+}
+#endif
+
 } // namespace test
 } // namespace megdnn
@@ -33,25 +33,6 @@ TEST_F(CUDA, TYPE_CVT) {
     }
 }
 
-TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) {
-    const size_t RUNS = 3;
-    auto run = [&](TensorLayout src, TensorLayout dst) {
-        Benchmarker<TypeCvt> benchmarker(handle_cuda());
-        auto&& layout = src;
-        benchmarker.set_times(RUNS);
-        dst.init_contiguous_stride();
-        auto used = benchmarker.execl({src, dst});
-        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
-               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
-                       (1024 * 1024 * 1024));
-    };
-    TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
-            dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32());
-    run(src, dst);
-}
-
 TEST_F(CUDA, QUANTIZED_TYPECVT) {
     UniformIntRNG int_rng{-66, 66};
     Checker<TypeCvt> checker(handle_cuda());
@@ -162,6 +143,25 @@ TEST_F(CUDA, TYPE_CVT_BFLOAT16) {
 }
 
 #if MEGDNN_WITH_BENCHMARK
+TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) {
+    const size_t RUNS = 3;
+    auto run = [&](TensorLayout src, TensorLayout dst) {
+        Benchmarker<TypeCvt> benchmarker(handle_cuda());
+        auto&& layout = src;
+        benchmarker.set_times(RUNS);
+        dst.init_contiguous_stride();
+        auto used = benchmarker.execl({src, dst});
+        printf("layout: %s bandwidth: %f GiB/s\n", layout.to_string().c_str(),
+               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
+                       (1024 * 1024 * 1024));
+    };
+    TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
+            dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32());
+    run(src, dst);
+}
+
 TEST_F(CUDA, BENCHMARK_TYPE_CVT) {
     UniformIntRNG rng{-128, 127};
     auto run = [&](TensorLayout src, TensorLayout dst) {
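For reference, the bandwidth figure printed by BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG counts every element once read and once written (the factor 2), and the `* 1000` only makes sense if `used` is the total time in milliseconds over all RUNS. That unit is an assumption here, as is this standalone restatement of the printf math:

```cpp
#include <cstddef>

// Restatement of the formula above; units assumed: milliseconds in, GiB/s out.
double typecvt_bandwidth_gib_per_s(
        std::size_t nr_elems, std::size_t dtype_size, std::size_t runs,
        double used_ms) {
    double bytes_moved = 2.0 * nr_elems * dtype_size * runs;  // read + write
    return bytes_moved / (used_ms / 1000.0) / (1024.0 * 1024.0 * 1024.0);
}

// e.g. the {16, 128, 128} Float32 layout: 16*128*128 = 262144 elements;
// with 3 runs in 1.0 ms total: 2 * 262144 * 4 * 3 / 0.001 / 2^30 ~= 5.86 GiB/s
```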