fix(dnn/cuda): disallow implicit dtype conversion in cublaslt matmul algos

disable tensor op matmul kernels when input and output tensors are in f32 data type to avoid potential accuracy loss GitOrigin-RevId: 36859cba5a
3 years ago · 0708bc780c
--- a/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp
+++ b/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp
@@ -313,6 +313,19 @@ bool CUBLASLTMatmulDesc::get_algorithm_heuristic(const SizeArgs& args,
    cublas_check(cublasLtMatmulPreferenceSetAttribute(
            algo_pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &algo_ws_limit,
            sizeof(algo_ws_limit)));
 #if CUDA_VERSION < 11000
    bool is_f32_config = args.layout_a.dtype == dtype::Float32() &&
                         args.layout_b.dtype == dtype::Float32() &&
                         args.layout_c.dtype == dtype::Float32();
    if (is_f32_config) {
        // disable HMMA tensor op matmul when inputs and output are all f32
        // tensors, to avoid the potential accuracy loss
        uint32_t math_mode = CUBLAS_DEFAULT_MATH;
        cublas_check(cublasLtMatmulPreferenceSetAttribute(
                algo_pref, CUBLASLT_MATMUL_PREF_MATH_MODE_MASK, &math_mode,
                sizeof(math_mode)));
    }
 #endif
    status = cublasLtMatmulAlgoGetHeuristic(
            cublasLt_handle, matmul_desc,
            dt_c == CUDA_R_32I ? layout_trans_b : layout_b,
--- a/dnn/test/cuda/cutlass_matmul.cpp
+++ b/dnn/test/cuda/cutlass_matmul.cpp
@@ -215,6 +215,7 @@ std::vector<BenchArgs> get_feat_model_args() {
    return args;
 }

 #if CUDA_VERSION >= 10020
 std::vector<BenchArgs> get_f16_feat_model_args() {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{128, 9216, 9216});
@@ -222,6 +223,7 @@ std::vector<BenchArgs> get_f16_feat_model_args() {
    args.emplace_back(BenchArgs{128, 5184, 5184});
    return args;
 }
 #endif

 void benchmark_matrix_mul(
        Handle* handle, const std::vector<BenchArgs>& args, DType A_dtype,
--- a/dnn/test/cuda/matrix_mul.cpp
+++ b/dnn/test/cuda/matrix_mul.cpp
@@ -473,7 +473,34 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
            execs({A, B, {}});
    }
 }
 TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
    require_compute_capability(7, 5);
    size_t m = 128, n = 1024, k = 18432;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMulForward>("CUBLAS_LT"));

    using Param = MatrixMul::Param;

    Param param;
    DType stype = dtype::Float32();
    DType dtype = dtype::Float32();
    TensorShape A, B;
    param.transposeA = param.transposeB = 0;
    if (param.transposeA)
        A = TensorShape{k, m};
    else
        A = TensorShape{m, k};
    if (param.transposeB)
        B = TensorShape{n, k};
    else
        B = TensorShape{k, n};
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype)
            .execs({A, B, {}});
 }
 } // namespace test
 } // namespace megdnn
 // vim: syntax=cpp.doxygen