disable tensor op matmul kernels when the input and output tensors are all f32, to avoid potential accuracy loss
GitOrigin-RevId: 36859cba5a
release-1.6
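
The key piece of the change is the CUBLASLT_MATMUL_PREF_MATH_MODE_MASK preference set just before the heuristic query: on CUDA 10.x, cuBLASLt may otherwise select HMMA (f16 tensor core) kernels for a GEMM whose inputs and output are all f32, computing in reduced precision internally. The following is a minimal standalone sketch of the same idea, not MegEngine code: the helper name query_f32_heuristic and the assumption that the handle, matmul descriptor, and layouts are already created are illustrative only.

// Sketch only: restrict the cuBLASLt heuristic to the default math mode
// for an all-f32 GEMM on pre-CUDA-11 toolkits.
#include <cuda.h>
#include <cublasLt.h>
#include <cstdint>

cublasStatus_t query_f32_heuristic(
        cublasLtHandle_t lt, cublasLtMatmulDesc_t op_desc,
        cublasLtMatrixLayout_t a, cublasLtMatrixLayout_t b,
        cublasLtMatrixLayout_t c, uint64_t ws_limit,
        cublasLtMatmulHeuristicResult_t* result) {
    cublasLtMatmulPreference_t pref;
    cublasStatus_t st = cublasLtMatmulPreferenceCreate(&pref);
    if (st != CUBLAS_STATUS_SUCCESS)
        return st;
    st = cublasLtMatmulPreferenceSetAttribute(
            pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &ws_limit,
            sizeof(ws_limit));
#if CUDA_VERSION < 11000
    if (st == CUBLAS_STATUS_SUCCESS) {
        // mask out tensor op math so the heuristic only returns plain f32
        // FMA kernels; without this, f32 in/out GEMMs can land on HMMA
        // kernels that compute in f16 internally
        uint32_t math_mode = CUBLAS_DEFAULT_MATH;
        st = cublasLtMatmulPreferenceSetAttribute(
                pref, CUBLASLT_MATMUL_PREF_MATH_MODE_MASK, &math_mode,
                sizeof(math_mode));
    }
#endif
    int returned = 0;
    if (st == CUBLAS_STATUS_SUCCESS) {
        // same layout is used for C and D in this out-of-place f32 GEMM
        st = cublasLtMatmulAlgoGetHeuristic(
                lt, op_desc, a, b, c, c, pref, /*requestedAlgoCount=*/1,
                result, &returned);
    }
    cublasLtMatmulPreferenceDestroy(pref);
    return st;
}

The CUDA_VERSION < 11000 guard mirrors the patch below: on CUDA 11+ the default compute type for f32 GEMMs no longer down-converts operands to f16 (that requires opting in, e.g. via CUBLAS_COMPUTE_32F_FAST_16F), so the mask is presumably unnecessary there.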
@@ -313,6 +313,19 @@ bool CUBLASLTMatmulDesc::get_algorithm_heuristic(const SizeArgs& args,
     cublas_check(cublasLtMatmulPreferenceSetAttribute(
             algo_pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &algo_ws_limit,
             sizeof(algo_ws_limit)));
+#if CUDA_VERSION < 11000
+    bool is_f32_config = args.layout_a.dtype == dtype::Float32() &&
+                         args.layout_b.dtype == dtype::Float32() &&
+                         args.layout_c.dtype == dtype::Float32();
+    if (is_f32_config) {
+        // disable HMMA tensor op matmul when inputs and output are all f32
+        // tensors, to avoid the potential accuracy loss
+        uint32_t math_mode = CUBLAS_DEFAULT_MATH;
+        cublas_check(cublasLtMatmulPreferenceSetAttribute(
+                algo_pref, CUBLASLT_MATMUL_PREF_MATH_MODE_MASK, &math_mode,
+                sizeof(math_mode)));
+    }
+#endif
     status = cublasLtMatmulAlgoGetHeuristic(
             cublasLt_handle, matmul_desc,
             dt_c == CUDA_R_32I ? layout_trans_b : layout_b,
@@ -215,6 +215,7 @@ std::vector<BenchArgs> get_feat_model_args() {
     return args;
 }
 
+#if CUDA_VERSION >= 10020
 std::vector<BenchArgs> get_f16_feat_model_args() {
     std::vector<BenchArgs> args;
     args.emplace_back(BenchArgs{128, 9216, 9216});
@@ -222,6 +223,7 @@ std::vector<BenchArgs> get_f16_feat_model_args() {
     args.emplace_back(BenchArgs{128, 5184, 5184});
     return args;
 }
+#endif
 
 void benchmark_matrix_mul(
         Handle* handle, const std::vector<BenchArgs>& args, DType A_dtype,
@@ -473,7 +473,34 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
         execs({A, B, {}});
     }
 }
+
+TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
+    require_compute_capability(7, 5);
+    size_t m = 128, n = 1024, k = 18432;
+    Checker<MatrixMul> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
+    using Param = MatrixMul::Param;
+    Param param;
+    DType stype = dtype::Float32();
+    DType dtype = dtype::Float32();
+    TensorShape A, B;
+    param.transposeA = param.transposeB = 0;
+    if (param.transposeA)
+        A = TensorShape{k, m};
+    else
+        A = TensorShape{m, k};
+    if (param.transposeB)
+        B = TensorShape{n, k};
+    else
+        B = TensorShape{k, n};
+    checker.set_param(param)
+            .set_dtype(0, stype)
+            .set_dtype(1, stype)
+            .set_dtype(2, dtype)
+            .execs({A, B, {}});
+}
 } // namespace test
 } // namespace megdnn
 
 // vim: syntax=cpp.doxygen