
fix(megdnn): some fixes of execution policy

GitOrigin-RevId: 920f39bcb6
tags/v1.3.0
Megvii Engine Team, 4 years ago
commit af42ce7e69
7 changed files with 91 additions and 22 deletions
1. dnn/src/cuda/conv_bias/batched_matmul.cpp (+5 -0)
2. dnn/src/cuda/conv_bias/group_conv.cpp (+2 -1)
3. dnn/test/common/matrix_mul.cpp (+34 -4)
4. dnn/test/common/matrix_mul.h (+10 -4)
5. dnn/test/cuda/batched_matrix_mul.cpp (+25 -0)
6. dnn/test/cuda/dilated_convolution.cpp (+4 -4)
7. dnn/test/cuda/matrix_mul.cpp (+11 -9)

dnn/src/cuda/conv_bias/batched_matmul.cpp (+5 -0)

@@ -44,6 +44,9 @@ std::pair<TensorLayoutArray, MatrixMulForward::Param> sub_opr_config(
     B.dtype = src_layout.dtype;
     C = {{dst_layout.shape[0], dst_layout.shape[1], B.shape[2]},
          dst_layout.dtype};
+    C.stride[2] = 1;
+    C.stride[1] = dst_layout.stride[1];
+    C.stride[0] = dst_layout.stride[0];
 
     MatrixMulForward::Param param;
     if (opr->param().compute_mode == param::Convolution::ComputeMode::FLOAT32) {

@@ -89,6 +92,8 @@ bool ConvBiasForwardImpl::AlgoBatchedMatmul::is_available(
         return false;
 
     auto config = prepare_sub_opr(args);
+    //! The dst of batched matmul should be contiguous
+    if (!config.first[2].is_contiguous()) return false;
 
     auto&& fm = args.filter_meta;
     return fm.format == Param::Format::NCHW &&
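
Why the availability check matters: sub_opr_config now copies the real dst strides into the matmul C layout instead of assuming a compact one, so a dst with, e.g., a padded batch stride no longer aliases a dense layout; AlgoBatchedMatmul simply reports itself unavailable for it. Below is a minimal standalone sketch of the contiguity rule being relied on, assuming the usual packed-stride definition; it is not megdnn's TensorLayout, whose real check is config.first[2].is_contiguous().

#include <cstddef>
#include <cstdio>

// A layout is contiguous when, scanning dims inner to outer, each stride
// equals the product of the shapes of all inner dims (size-1 dims excepted).
bool is_contiguous(const size_t* shape, const ptrdiff_t* stride, int ndim) {
    ptrdiff_t expect = 1;
    for (int i = ndim - 1; i >= 0; --i) {
        if (shape[i] != 1 && stride[i] != expect)
            return false;
        expect *= static_cast<ptrdiff_t>(shape[i]);
    }
    return true;
}

int main() {
    size_t shape[3] = {2, 3, 4};
    ptrdiff_t dense[3] = {12, 4, 1};   // packed batched-matmul dst: accepted
    ptrdiff_t padded[3] = {16, 4, 1};  // padded batch stride: rejected
    printf("%d %d\n", is_contiguous(shape, dense, 3),
           is_contiguous(shape, padded, 3));  // prints "1 0"
}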


dnn/src/cuda/conv_bias/group_conv.cpp (+2 -1)

@@ -109,7 +109,8 @@ void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec(
     auto sub_args = args;
     sub_args.dst_tensor = &conv_dst_tensor;
     sub_args.dst_layout = &conv_dst_tensor.layout;
-    TensorND tsrc{*args.src_tensor}, tdst{conv_dst_tensor}, tbias{*args.bias_tensor};
+    TensorND tsrc{*args.src_tensor}, tdst{conv_dst_tensor},
+            tbias{*args.bias_tensor};
     SmallVector<size_t> flt_shape(0);
     std::vector<ptrdiff_t> flt_stride(0);
     size_t idx = 0;


dnn/test/common/matrix_mul.cpp (+34 -4)

@@ -17,6 +17,8 @@
 using namespace megdnn;
 using namespace test;
 
+constexpr size_t matrix_mul::TestArg::UNSET_STRIDE_VAL;
+
 std::vector<matrix_mul::TestArg> matrix_mul::get_matmul_args_no_mask() {
     std::vector<TestArg> args;

@@ -57,7 +59,9 @@ matrix_mul::get_batched_matmul_args_cublaslt() {
     // so please uncomment it if the bug is fixed
 
             for (size_t k : {32, 64}) {
-                args.emplace_back(m, n, k, 0, 0, 0, 0, 2);
+                args.emplace_back(m, n, k, 0, TestArg::UNSET_STRIDE_VAL,
+                                  TestArg::UNSET_STRIDE_VAL,
+                                  TestArg::UNSET_STRIDE_VAL, 2);
             }
         }
     }

@@ -70,7 +74,9 @@ matrix_mul::get_batched_matmul_args_int8x8x32() {
     for (size_t m : {1, 2, 3, 4, 5, 8, 64}) {
         for (size_t n : {1, 2, 3, 4, 5, 8, 64}) {
             for (size_t k : {1, 2, 3, 4, 5, 8, 64}) {
-                args.emplace_back(m, n, k, 0, 0, 0, 0, 2);
+                args.emplace_back(m, n, k, 0, TestArg::UNSET_STRIDE_VAL,
+                                  TestArg::UNSET_STRIDE_VAL,
+                                  TestArg::UNSET_STRIDE_VAL, 2);
             }
         }
     }

@@ -136,6 +142,30 @@ std::vector<matrix_mul::TestArg> matrix_mul::get_batched_matmul_args() {
     return args;
 }
 
+std::vector<matrix_mul::TestArg>
+matrix_mul::get_batched_matmul_broadcast_args() {
+    std::vector<TestArg> args;
+    for (size_t mask = 0; mask < 4; ++mask) {
+        std::vector<TestArg> args_temp =
+                matrix_mul::get_batched_matmul_broadcast_args_mask(mask);
+        for (auto arg : args_temp)
+            args.emplace_back(arg);
+    }
+    return args;
+}
+
+std::vector<matrix_mul::TestArg>
+matrix_mul::get_batched_matmul_broadcast_args_mask(uint8_t mask) {
+    std::vector<TestArg> args;
+    std::vector<TestArg> args_temp =
+            matrix_mul::get_batched_matmul_args_mask(mask);
+    for (auto arg : args_temp) {
+        args.emplace_back(arg);
+        args.back().A_batch_stride = 0;
+    }
+    return args;
+}
+
 template <typename Opr>
 void matrix_mul::check_matrix_mul(DType A_dtype, DType B_dtype, DType C_dtype,
                                   Handle* handle,

@@ -170,9 +200,9 @@ void matrix_mul::check_matrix_mul(DType A_dtype, DType B_dtype, DType C_dtype,
         checker.set_rng(0, rng.get()).set_rng(1, rng.get());
     }
 
-    //! return expect if stride == 0, stride otherwise
+    //! return expect if stride == -1, stride otherwise
     auto stride_val = [](size_t stride, size_t expect) -> size_t {
-        if (stride == 0) {
+        if (stride == TestArg::UNSET_STRIDE_VAL) {
            return expect;
        } else {
            return stride;
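
The new broadcast args are the regular masked batched args with A_batch_stride forced to 0, i.e. every batch multiplies the same A. Here is a standalone sketch of what a zero batch stride means, using plain loops and illustrative names rather than megdnn's batched matmul.

#include <cstddef>
#include <cstdio>
#include <vector>

// C[b] = A_b * B[b]; A advances A_batch_stride elements per batch, so an
// A_batch_stride of 0 broadcasts a single A across every batch.
void batched_matmul(const float* A, size_t A_batch_stride, const float* B,
                    float* C, size_t batch, size_t m, size_t n, size_t k) {
    for (size_t b = 0; b < batch; ++b) {
        const float* Ab = A + b * A_batch_stride;
        const float* Bb = B + b * k * n;
        float* Cb = C + b * m * n;
        for (size_t i = 0; i < m; ++i)
            for (size_t j = 0; j < n; ++j) {
                float acc = 0;
                for (size_t p = 0; p < k; ++p)
                    acc += Ab[i * k + p] * Bb[p * n + j];
                Cb[i * n + j] = acc;
            }
    }
}

int main() {
    size_t batch = 2, m = 2, n = 2, k = 2;
    std::vector<float> A = {1, 2, 3, 4};       // a single 2x2 A, shared
    std::vector<float> B(batch * k * n, 1.0f); // each B[b] is all-ones
    std::vector<float> C(batch * m * n, 0.0f);
    batched_matmul(A.data(), /*A_batch_stride=*/0, B.data(), C.data(),
                   batch, m, n, k);
    printf("%g %g\n", C[0], C[4]);  // 3 3: both batches see the same A
}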


dnn/test/common/matrix_mul.h (+10 -4)

@@ -24,15 +24,19 @@ namespace matrix_mul {
 
 // mask & 1 denotes transposeA; mask & 2 denotes transposeB
 struct TestArg {
+    constexpr static size_t UNSET_STRIDE_VAL = static_cast<size_t>(-1);
     size_t m, n, k, mask;
     size_t A_stride, B_stride, C_stride, b;
     size_t A_batch_stride, B_batch_stride, C_batch_stride;
     // stride = 0 means the default stride, the dim is contiguous, i.e. the
     // stride value which makes tensor compact.
-    TestArg(size_t m, size_t n, size_t k, size_t mask, size_t A_stride = 0,
-            size_t B_stride = 0, size_t C_stride = 0, size_t b = 1,
-            size_t A_batch_stride = 0, size_t B_batch_stride = 0,
-            size_t C_batch_stride = 0)
+    TestArg(size_t m, size_t n, size_t k, size_t mask,
+            size_t A_stride = UNSET_STRIDE_VAL,
+            size_t B_stride = UNSET_STRIDE_VAL,
+            size_t C_stride = UNSET_STRIDE_VAL, size_t b = 1,
+            size_t A_batch_stride = UNSET_STRIDE_VAL,
+            size_t B_batch_stride = UNSET_STRIDE_VAL,
+            size_t C_batch_stride = UNSET_STRIDE_VAL)
             : m{m},
               n{n},
               k{k},

@@ -51,6 +55,8 @@ std::vector<TestArg> get_matmul_args_mask(uint8_t mask);
 std::vector<TestArg> get_matmul_args();
 std::vector<TestArg> get_batched_matmul_args_mask(uint8_t mask);
 std::vector<TestArg> get_batched_matmul_args();
+std::vector<TestArg> get_batched_matmul_broadcast_args();
+std::vector<TestArg> get_batched_matmul_broadcast_args_mask(uint8_t mask);
 std::vector<TestArg> get_matmul_mk_packed_args(size_t nbase);
 std::vector<TestArg> get_batched_matmul_args_cublaslt();
 std::vector<TestArg> get_batched_matmul_args_int8x8x32();
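
The sentinel change is the heart of this commit: 0 used to mean "use the default compact stride", but a batch stride of 0 is now a legitimate value (it expresses broadcasting), so "unset" moves to static_cast<size_t>(-1), i.e. SIZE_MAX. A standalone sketch of the pattern, mirroring the stride_val helper in the diff above:

#include <cstddef>
#include <cstdio>

// SIZE_MAX is safe as "unset": no real stride can be that large, while 0 is
// now a meaningful stride (broadcast).
constexpr size_t UNSET_STRIDE_VAL = static_cast<size_t>(-1);

// Return the compact default when the stride was left unset, else the
// explicit value; same logic as the stride_val lambda above.
size_t stride_val(size_t stride, size_t expect) {
    return stride == UNSET_STRIDE_VAL ? expect : stride;
}

int main() {
    printf("%zu\n", stride_val(UNSET_STRIDE_VAL, 64));  // 64: compact default
    printf("%zu\n", stride_val(0, 64));                 // 0: explicit broadcast
}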


dnn/test/cuda/batched_matrix_mul.cpp (+25 -0)

@@ -8,6 +8,7 @@
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
+#include <vector>
 #include "test/cuda/fixture.h"
 
 #include "test/common/checker.h"

@@ -62,6 +63,30 @@ TEST_F(CUDA, BATCHED_MATRIX_MUL_LT_F32_PART4) {
 
 #undef F32_TEST_PART
 
+TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART1) {
+    matrix_mul::check_batched_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS",
+            1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(0));
+}
+
+TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART2) {
+    matrix_mul::check_batched_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS",
+            1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(1));
+}
+
+TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART3) {
+    matrix_mul::check_batched_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS",
+            1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(2));
+}
+
+TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART4) {
+    matrix_mul::check_batched_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS",
+            1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(3));
+}
+
 TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BRUTE_FORCE_PART1) {
     matrix_mul::check_batched_matrix_mul(
             dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(),
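
The four BROADCAST parts sweep mask 0 through 3; per the comment in matrix_mul.h, bit 0 of the mask requests transposeA and bit 1 transposeB, so the sweep covers all four transpose combinations against a broadcast A. A trivial decoding sketch:

#include <cstdio>

int main() {
    // mask & 1 denotes transposeA; mask & 2 denotes transposeB
    for (unsigned mask = 0; mask < 4; ++mask)
        printf("mask=%u transposeA=%d transposeB=%d\n", mask,
               (mask & 1) != 0, (mask & 2) != 0);
}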


dnn/test/cuda/dilated_convolution.cpp (+4 -4)

@@ -75,8 +75,8 @@ TEST_F(CUDA, DILATED_CONVOLUTION_BACKWARD_DATA)
             "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
     printf("cudnn version >= 7.5, use cudnn impl for dilated convolution\n");
 #else
-    checker.set_before_exec_callback(
-            AlgoChecker<ConvolutionBackwardData>("MATMUL"));
+    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
+            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
 #endif
     NormalRNG default_rng;
     for (auto &&arg: args) {

@@ -139,8 +139,8 @@ TEST_F(CUDA, DILATED_CONVOLUTION_BACKWARD_FILTER)
             "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1" CUDNN_VERSION_STRING));
     printf("cudnn version >= 7.5, use cudnn impl for dilated convolution\n");
 #else
-    checker.set_before_exec_callback(
-            AlgoChecker<ConvolutionBackwardFilter>("MATMUL"));
+    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
+            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
 #endif
     NormalRNG default_rng;
     bool first_run = true;
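
This is the execution-policy fix the commit title refers to: the old AlgoChecker pinned only the top-level "MATMUL" algorithm and left the sub-operator's algorithm to the heuristic, while ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}} pins the nested matrix-mul to CUBLAS as well. Below is a sketch of the recursive shape such a name has, with illustrative stand-in types rather than megdnn's actual ExecutionPolicyAlgoName.

#include <cstdio>
#include <string>
#include <vector>

// Illustrative stand-in: an algorithm name plus the policy names of its
// sub-operators, recursively.
struct PolicyName {
    std::string name;
    std::vector<PolicyName> sub;
};

int main() {
    // In spirit: ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}
    PolicyName p{"MATMUL", {PolicyName{"CUBLAS", {}}}};
    printf("%s -> %s\n", p.name.c_str(), p.sub[0].name.c_str());
}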


dnn/test/cuda/matrix_mul.cpp (+11 -9)

@@ -51,7 +51,8 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
     if (cuda::current_device_prop().major < 7 ||
         (cuda::current_device_prop().major == 7 &&
          cuda::current_device_prop().minor < 5)) {
-        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device doesn't support\n");
+        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
+               "doesn't support\n");
         return;
     }
     Checker<MatrixMul> checker(handle_cuda(), false);

@@ -257,19 +258,19 @@ TEST_F(CUDA, MATRIX_MUL) {
         BS = TensorShape{k, n};
         CS = TensorShape{m, n};
         TensorLayout AL, BL, CL;
-        if (arg.A_stride == 0) {
+        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             AL = TensorLayout(AS, dtype::Float32());
         } else {
             AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                               dtype::Float32());
         }
-        if (arg.B_stride == 0) {
+        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             BL = TensorLayout(BS, dtype::Float32());
         } else {
             BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                               dtype::Float32());
         }
-        if (arg.C_stride == 0) {
+        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             CL = TensorLayout(CS, dtype::Float32());
         } else {
             CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},

@@ -285,8 +286,9 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT)
     NormalRNG normal_rng;
     Checker<MatrixMul> checker(handle_cuda());
     checker.set_rng(0, &normal_rng)
-            .set_rng(1, &normal_rng)
-            .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
+            .set_rng(1, &normal_rng)
+            .set_before_exec_callback(
+                    AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
     using Param = MatrixMul::Param;
     size_t m = 32, n = 32, k = 32;
     // test Int8 matmul

@@ -350,19 +352,19 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT)
         BS = TensorShape{k, n};
         CS = TensorShape{m, n};
         TensorLayout AL, BL, CL;
-        if (arg.A_stride == 0) {
+        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             AL = TensorLayout(AS, dtype::Float32());
         } else {
             AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                               dtype::Float32());
         }
-        if (arg.B_stride == 0) {
+        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             BL = TensorLayout(BS, dtype::Float32());
         } else {
             BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                               dtype::Float32());
         }
-        if (arg.C_stride == 0) {
+        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             CL = TensorLayout(CS, dtype::Float32());
         } else {
             CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
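
For context on the strided branches above: A_stride here is the leading (row) stride, so TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1}, ...) describes an m x k matrix whose consecutive rows sit arg.A_stride elements apart, which permits padded rows. A standalone indexing sketch under that assumption:

#include <cstddef>
#include <cstdio>
#include <vector>

// Element (i, j) of a row-major matrix with leading stride ld lives at
// i * ld + j; ld >= ncols leaves a gap of ld - ncols elements per row.
float at(const std::vector<float>& buf, size_t ld, size_t i, size_t j) {
    return buf[i * ld + j];
}

int main() {
    size_t rows = 2, cols = 3, ld = 4;        // one padding element per row
    std::vector<float> buf(rows * ld, -1.0f); // -1 marks padding
    for (size_t i = 0; i < rows; ++i)
        for (size_t j = 0; j < cols; ++j)
            buf[i * ld + j] = float(i * cols + j);
    printf("%g %g\n", at(buf, ld, 1, 0), buf[3]);  // 3 (data), -1 (padding)
}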

