From ff0e6be7b97f721e85be13996ceee0b8aec87cbb Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Mon, 16 Aug 2021 21:43:11 +0800
Subject: [PATCH] fix(dnn/cuda): fix cutlass tensorop kernels

do not compile cutlass tensorop kernels when the CUDA version is less than 10.2

GitOrigin-RevId: d4c37d5f41ff6e20cf712149067594559529e6cf
---
 dnn/scripts/cutlass_generator/conv2d_operation.py  | 22 +++++++++----
 dnn/scripts/cutlass_generator/gemm_operation.py    | 37 +++++++++++++++-------
 dnn/scripts/cutlass_generator/generator.py         | 28 ++++++++++++----
 dnn/scripts/cutlass_generator/manifest.py          | 12 +++++--
 dnn/src/cuda/cutlass/initialize_all.cu             | 16 +++++++---
 dnn/src/cuda/matrix_mul/algos.cpp                  |  4 +++
 dnn/src/cuda/matrix_mul/algos.h                    | 34 +++++++++++++++++++-
 .../cuda/matrix_mul/cutlass_float16_tensorop.cpp   |  4 +--
 .../cutlass_float16_tensorop_split_k.cpp           | 10 +++---
 dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp   |  2 +-
 .../cutlass_float32_simt_gemv_batched_strided.cpp  |  2 +-
 .../matrix_mul/cutlass_float32_simt_split_k.cpp    |  8 ++---
 .../cuda/matrix_mul/cutlass_matrix_mul_base.cpp    |  2 +-
 dnn/test/cuda/cutlass_matmul.cpp                   |  5 ++-
 14 files changed, 139 insertions(+), 47 deletions(-)

diff --git a/dnn/scripts/cutlass_generator/conv2d_operation.py b/dnn/scripts/cutlass_generator/conv2d_operation.py
index a931a8fb..735f4d94 100644
--- a/dnn/scripts/cutlass_generator/conv2d_operation.py
+++ b/dnn/scripts/cutlass_generator/conv2d_operation.py
@@ -19,7 +19,8 @@ class Conv2dOperation:
   #
   def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
     epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
-    need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False):
+    need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
+    required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
 
     self.operation_kind = OperationKind.Conv2d
     self.conv_kind = conv_kind
@@ -36,6 +37,9 @@ class Conv2dOperation:
     self.need_load_from_const = need_load_from_const
     self.implicit_gemm_mode = implicit_gemm_mode
     self.without_shared_load = without_shared_load
+    self.required_cuda_ver_major = required_cuda_ver_major
+    self.required_cuda_ver_minor = required_cuda_ver_minor
+
   #
   def accumulator_type(self):
     accum = self.tile_description.math_instruction.element_accumulator
@@ -320,7 +324,8 @@ using Deconvolution =
 #
 def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
-                  skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False):
+                  skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \
+                  required_cuda_ver_minor = 2):
   operations = []
 
   element_epilogue = DataType.f32
@@ -407,10 +412,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay
       bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
       dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))
 
-      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load)
+      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue,
epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) operations.append(new_operation) if not skip_unity_kernel: - new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load) + new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) operations.append(new_operation) return operations @@ -545,7 +550,7 @@ class EmitConvSingleKernelWrapper(): self.convolution_name = "Deconvolution" self.header_template = """ -#if !MEGDNN_TEGRA_X1 +#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor}) // ignore warning of cutlass #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" @@ -589,14 +594,17 @@ void initialize_${operation_name}(Manifest &manifest) { else: self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name()) self.kernel_file = open(self.kernel_path, "w") - self.kernel_file.write(self.header_template) + self.kernel_file.write(SubstituteTemplate(self.header_template, { + 'required_cuda_ver_major': str(self.operation.required_cuda_ver_major), + 'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor), + })) return self # def emit(self): self.kernel_file.write(SubstituteTemplate(self.instance_template, { 'operation_instance': self.instance_emitter.emit(self.operation), - })) + })) # emit manifest helper manifest = SubstituteTemplate(self.manifest_template, { diff --git a/dnn/scripts/cutlass_generator/gemm_operation.py b/dnn/scripts/cutlass_generator/gemm_operation.py index a6f33b18..382d6aaa 100644 --- a/dnn/scripts/cutlass_generator/gemm_operation.py +++ b/dnn/scripts/cutlass_generator/gemm_operation.py @@ -23,7 +23,8 @@ from library import * class GemmOperation: # def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \ - epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8): + epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \ + required_cuda_ver_major = 9, required_cuda_ver_minor = 2): self.operation_kind = OperationKind.Gemm self.arch = arch @@ -35,6 +36,9 @@ class GemmOperation: self.element_epilogue = element_epilogue self.epilogue_functor = epilogue_functor self.swizzling_functor = swizzling_functor + self.required_cuda_ver_major = required_cuda_ver_major + self.required_cuda_ver_minor = required_cuda_ver_minor + # def is_complex(self): @@ -161,7 +165,8 @@ class GemmOperation: # class GemvBatchedStridedOperation: # - def __init__(self, gemm_kind, arch, math_inst, threadblock_shape, thread_shape, A, B, C): + def __init__(self, gemm_kind, arch, math_inst, threadblock_shape, thread_shape, A, B, C, \ + required_cuda_ver_major = 9, required_cuda_ver_minor = 2): self.operation_kind = OperationKind.Gemm self.arch = arch @@ -172,6 +177,8 @@ class GemvBatchedStridedOperation: self.A = A self.B = B self.C = C + self.required_cuda_ver_major = required_cuda_ver_major + self.required_cuda_ver_minor = required_cuda_ver_minor # def accumulator_type(self): @@ -243,7 +250,7 @@ class GemvBatchedStridedOperation: 
return self.procedural_name() # -def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a = 32, align_b = 32, align_c = 32): +def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a = 32, align_b = 32, align_c = 32, required_cuda_ver_major = 9, required_cuda_ver_minor = 2): operations = [] swizzling_functor = SwizzlingFunctor.Identity1 @@ -261,20 +268,23 @@ def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b])) C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c])) operations.append(GemmOperation(GemmKind.Gemm, min_cc, tile, A, B, C, \ - element_epilogue, epilogue, swizzling_functor)) + element_epilogue, epilogue, swizzling_functor, \ + required_cuda_ver_major, required_cuda_ver_minor)) operations.append(GemmOperation(GemmKind.SplitKParallel, min_cc, tile, A, B, C, \ - element_epilogue, epilogue, swizzling_functor)) + element_epilogue, epilogue, swizzling_functor, \ + required_cuda_ver_major, required_cuda_ver_minor)) return operations def GeneratesGemv(math_inst, threadblock_shape, thread_shape, data_type, layout_a, layout_b, layout_c, min_cc, \ - align_a = 32, align_b = 32, align_c = 32): + align_a = 32, align_b = 32, align_c = 32, \ + required_cuda_ver_major = 9, required_cuda_ver_minor = 2): element_a, element_b, element_c, element_epilogue = data_type A = TensorDescription(element_a, layout_a, int(align_a//DataTypeSize[element_a])) B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b])) C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c])) return GemvBatchedStridedOperation(GemmKind.GemvBatchedStrided, min_cc, math_inst, threadblock_shape, thread_shape, \ - A, B, C) + A, B, C, required_cuda_ver_major, required_cuda_ver_minor) ################################################################################################### # @@ -1025,7 +1035,7 @@ class EmitGemmSingleKernelWrapper: self.instance_emitter = instance_emitters[self.operation.gemm_kind] self.header_template = """ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor}) // ignore warning of cutlass #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" @@ -1065,7 +1075,10 @@ void initialize_${operation_name}(Manifest &manifest) { def __enter__(self): self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name()) self.kernel_file = open(self.kernel_path, "w") - self.kernel_file.write(self.header_template) + self.kernel_file.write(SubstituteTemplate(self.header_template, { + 'required_cuda_ver_major': str(self.operation.required_cuda_ver_major), + 'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor), + })) return self # @@ -1109,7 +1122,7 @@ template void megdnn::cuda::cutlass_wrapper:: self.instance_emitter = EmitGemvBatchedStridedInstance() self.header_template = """ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor}) // ignore warning of cutlass #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" 
@@ -1136,7 +1149,9 @@ ${operation_instance} self.kernel_file = open(self.kernel_path, "w") self.kernel_file.write(SubstituteTemplate(self.header_template, { 'wrapper_path': self.wrapper_path, - })) + 'required_cuda_ver_major': str(self.operation.required_cuda_ver_major), + 'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor), + })) return self # diff --git a/dnn/scripts/cutlass_generator/generator.py b/dnn/scripts/cutlass_generator/generator.py index cd7f810d..5a4d3d35 100644 --- a/dnn/scripts/cutlass_generator/generator.py +++ b/dnn/scripts/cutlass_generator/generator.py @@ -217,6 +217,9 @@ def GenerateConv2d_TensorOp_8816(args): min_cc = 75 max_cc = 1024 + cuda_major = 10 + cuda_minor = 2 + for math_inst in math_instructions: for layout in layouts: for dst_type, dst_layout in zip(dst_types, dst_layouts): @@ -234,7 +237,7 @@ def GenerateConv2d_TensorOp_8816(args): ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmTN, True) + False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) else: assert dst_layout == LayoutType.TensorNC4HW4 tile_descriptions = [ @@ -250,7 +253,7 @@ def GenerateConv2d_TensorOp_8816(args): ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], dst_layout, dst_type, min_cc, 128, 128, 64, - False) + False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor) return operations @@ -281,6 +284,9 @@ def GenerateConv2d_TensorOp_8832(args): min_cc = 75 max_cc = 1024 + cuda_major = 10 + cuda_minor = 2 + for math_inst in math_instructions: for layout in layouts: for dst_layout in dst_layouts: @@ -293,7 +299,7 @@ def GenerateConv2d_TensorOp_8832(args): ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmTN, True) + False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) layouts_nhwc = [ (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32), @@ -316,12 +322,12 @@ def GenerateConv2d_TensorOp_8832(args): for tile in tile_descriptions: operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, dst_type, min_cc, layout[2], layout[2], 32, - False, ImplicitGemmMode.GemmTN, False) + False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64: dst_align = 32 if tile.threadblock_shape[1] == 32 else 64 operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align, - False, ImplicitGemmMode.GemmTN, True) + False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) return operations @@ -624,6 +630,8 @@ def GeneratesGemm_TensorOp_1688(args): alignment_constraints = [8, 4, 2, #1 ] + cuda_major = 10 + cuda_minor = 2 operations = [] for math_inst in math_instructions: @@ -655,7 +663,9 @@ def GeneratesGemm_TensorOp_1688(args): min_cc, \ align * 16, \ align * 16, \ - align * 16) + align * 16, \ + cuda_major, \ + cuda_minor) return operations # @@ -686,6 +696,8 @@ def GeneratesGemm_TensorOp_884(args): alignment_constraints = [8, 4, 2, # 1 ] + cuda_major = 10 + cuda_minor = 2 operations = [] for math_inst in math_instructions: @@ -717,7 +729,9 @@ def GeneratesGemm_TensorOp_884(args): min_cc, \ align * 16, \ align * 16, \ - align * 16) + align * 16, \ + cuda_major, \ + cuda_minor) return operations diff --git 
a/dnn/scripts/cutlass_generator/manifest.py b/dnn/scripts/cutlass_generator/manifest.py index 88c9fc6d..33aafb8a 100644 --- a/dnn/scripts/cutlass_generator/manifest.py +++ b/dnn/scripts/cutlass_generator/manifest.py @@ -351,6 +351,13 @@ void initialize_all(Manifest &manifest) { ################################################################################################### def GenerateManifest(args, operations, output_dir): + assert isinstance(operations, list) + if len(operations) == 0: + return + op = operations[0] + required_cuda_ver_major = op.required_cuda_ver_major + required_cuda_ver_minor = op.required_cuda_ver_minor + manifest_path = os.path.join(output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type)) f = open(manifest_path, "w") f.write(""" @@ -358,7 +365,7 @@ def GenerateManifest(args, operations, output_dir): Generated by generator.py - Do not edit. */ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +#if __CUDACC_VER_MAJOR__ > %s || (__CUDACC_VER_MAJOR__ == %s && __CUDACC_VER_MINOR__ >= %s) #include "cutlass/cutlass.h" #include "src/cuda/cutlass/library.h" @@ -367,7 +374,8 @@ def GenerateManifest(args, operations, output_dir): namespace cutlass { namespace library { -""") +""" % (str(required_cuda_ver_major), str(required_cuda_ver_major), str(required_cuda_ver_minor))) + for op in operations: f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name()) diff --git a/dnn/src/cuda/cutlass/initialize_all.cu b/dnn/src/cuda/cutlass/initialize_all.cu index e836dd76..89d32c5d 100644 --- a/dnn/src/cuda/cutlass/initialize_all.cu +++ b/dnn/src/cuda/cutlass/initialize_all.cu @@ -44,26 +44,34 @@ namespace cutlass { namespace library { ///////////////////////////////////////////////////////////////////////////////////////////////// +#if ((__CUDACC_VER_MAJOR__ > 10) || \ + (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) +#define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1 +#endif #if __CUDACC_VER_MAJOR__ > 9 || \ (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) void initialize_all_gemm_simt_operations(Manifest& manifest); +void initialize_all_conv2d_simt_operations(Manifest& manifest); +void initialize_all_deconv_simt_operations(Manifest& manifest); +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED void initialize_all_gemm_tensorop884_operations(Manifest& manifest); void initialize_all_gemm_tensorop1688_operations(Manifest& manifest); -void initialize_all_conv2d_simt_operations(Manifest& manifest); void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest); void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest); -void initialize_all_deconv_simt_operations(Manifest& manifest); +#endif void initialize_all(Manifest& manifest) { initialize_all_gemm_simt_operations(manifest); + initialize_all_conv2d_simt_operations(manifest); + initialize_all_deconv_simt_operations(manifest); +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED initialize_all_gemm_tensorop884_operations(manifest); initialize_all_gemm_tensorop1688_operations(manifest); - initialize_all_conv2d_simt_operations(manifest); initialize_all_conv2d_tensorop8816_operations(manifest); initialize_all_conv2d_tensorop8832_operations(manifest); - initialize_all_deconv_simt_operations(manifest); +#endif } #else diff --git a/dnn/src/cuda/matrix_mul/algos.cpp b/dnn/src/cuda/matrix_mul/algos.cpp index e3bb328d..ada31b74 100644 --- a/dnn/src/cuda/matrix_mul/algos.cpp +++ 
b/dnn/src/cuda/matrix_mul/algos.cpp @@ -43,6 +43,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { for (auto&& algo : simt_float32_gemv_batched_strided) { all_algos.push_back(&algo); } +#if CUDA_VERSION >= 10020 for (auto&& algo : tensorop_float16) { all_algos.push_back(&algo); } @@ -50,6 +51,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { all_algos.push_back(&algo); } #endif +#endif all_algos.push_back(&naive); for (auto&& algo : all_algos) { @@ -107,7 +109,9 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { #define cb(...) \ tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \ tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__}); +#if CUDA_VERSION >= 10020 FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) +#endif #undef cb #undef FOREACH_CUTLASS_MATMUL_F16_SHAPES } diff --git a/dnn/src/cuda/matrix_mul/algos.h b/dnn/src/cuda/matrix_mul/algos.h index 34c7cbf3..aa2807eb 100644 --- a/dnn/src/cuda/matrix_mul/algos.h +++ b/dnn/src/cuda/matrix_mul/algos.h @@ -241,6 +241,20 @@ public: return AlgoAttribute::REPRODUCIBLE; } MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT) + std::string param() const override { + std::string ret; + // FIXME: algo param compatible with old version, to avoid fastrun cache error + struct AlgoParam_ { + int threadblock_m, threadblock_n, threadblock_k; + int warp_m, warp_n, warp_k; + }; + AlgoParam_ algo_param{ + m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k, m_algo_param.warp_m, + m_algo_param.warp_n, m_algo_param.warp_k}; + serialize_write_pod(algo_param, ret); + return ret; + } private: void do_exec(const ExecArgs& args) const override; @@ -263,6 +277,21 @@ public: AlgoAttribute::USABLE_DEPEND_ON_SHAPE; } MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K) + std::string param() const override { + std::string ret; + // FIXME: algo param compatible with old version, to avoid fastrun cache + // error + struct AlgoParam_ { + int threadblock_m, threadblock_n, threadblock_k; + int warp_m, warp_n, warp_k; + }; + AlgoParam_ algo_param{ + m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k, m_algo_param.warp_m, + m_algo_param.warp_n, m_algo_param.warp_k}; + serialize_write_pod(algo_param, ret); + return ret; + } private: void do_exec(const ExecArgs& args) const override; @@ -297,6 +326,7 @@ private: std::string m_name; }; +#if CUDA_VERSION >= 10020 class MatrixMulForwardImpl::AlgoFloat16TensorOp final : public AlgoCutlassMatrixMulBase { public: @@ -345,7 +375,7 @@ private: int min_alignment_requirement() const override { return 2; } std::string m_name; }; - +#endif #endif class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { @@ -370,9 +400,11 @@ public: std::vector simt_float32_split_k; std::vector simt_float32_gemv_batched_strided; +#if CUDA_VERSION >= 10020 std::vector tensorop_float16; std::vector tensorop_float16_split_k; #endif +#endif std::vector all_algos; const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } diff --git a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp index ea2c05e9..1dcf6d84 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
* * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -15,7 +15,7 @@ #include "src/cuda/matrix_mul/algos.h" #include "src/cuda/utils.h" -#if CUDA_VERSION >= 9020 +#if CUDA_VERSION >= 10020 using namespace megdnn; using namespace cuda; diff --git a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp index 18211251..c9b9adf5 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -15,14 +15,14 @@ #include "src/cuda/matrix_mul/algos.h" #include "src/cuda/utils.h" -#if CUDA_VERSION >= 9020 +#if CUDA_VERSION >= 10020 using namespace megdnn; using namespace cuda; bool MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::is_available( const SizeArgs& args) const { auto&& param = args.opr->param(); - int n = args.layout_c.shape[1], + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], k = args.layout_a.shape[param.transposeA ? 0 : 1]; bool available = args.opr->param().format == param::MatrixMul::Format::DEFAULT && @@ -32,8 +32,8 @@ bool MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::is_available( auto&& device_prop = cuda::current_device_prop(); int y_grid_limit = device_prop.maxGridSize[1]; // limit y grid - available &= ((n + m_algo_param.threadblock_n - 1) / - m_algo_param.threadblock_n <= + available &= ((m + m_algo_param.threadblock_m - 1) / + m_algo_param.threadblock_m <= y_grid_limit); if (m_algo_param.instruction_m == 8 && m_algo_param.instruction_n == 8 && m_algo_param.instruction_k == 4) { diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp index 63d9faef..7278202d 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp index 6d581a8e..d0b94e35 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
* * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp index 10ef7f42..dfbc3a06 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -22,7 +22,7 @@ using namespace cuda; bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( const SizeArgs& args) const { auto&& param = args.opr->param(); - int n = args.layout_c.shape[1], + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], k = args.layout_a.shape[param.transposeA ? 0 : 1]; bool available = args.opr->param().format == param::MatrixMul::Format::DEFAULT && @@ -32,8 +32,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( auto&& device_prop = cuda::current_device_prop(); int y_grid_limit = device_prop.maxGridSize[1]; // limit y grid - available &= ((n + m_algo_param.threadblock_n - 1) / - m_algo_param.threadblock_n <= + available &= ((m + m_algo_param.threadblock_m - 1) / + m_algo_param.threadblock_m <= y_grid_limit); return available; } diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp index 55a95d1f..5169e498 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an diff --git a/dnn/test/cuda/cutlass_matmul.cpp b/dnn/test/cuda/cutlass_matmul.cpp index b406fcf8..50e42e96 100644 --- a/dnn/test/cuda/cutlass_matmul.cpp +++ b/dnn/test/cuda/cutlass_matmul.cpp @@ -21,7 +21,6 @@ #include "test/cuda/fixture.h" #include "test/cuda/utils.h" -#define MEGDNN_WITH_BENCHMARK 1 #if CUDA_VERSION >= 9020 namespace megdnn { namespace test { @@ -373,6 +372,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb) #undef cb #undef MEGDNN_FOREACH_CUTLASS_KERNEL +#if CUDA_VERSION >= 10020 #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \ cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \ @@ -448,6 +448,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb) #undef cb #undef MEGDNN_FOREACH_CUTLASS_KERNEL +#endif #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL) { @@ -462,12 +463,14 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) { "CUTLASS_FLOAT32_SIMT"); } +#if CUDA_VERSION >= 10020 TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) { benchmark_matrix_mul(handle_cuda(), get_f16_feat_model_args(), dtype::Float16(), dtype::Float16(), dtype::Float16(), "CUTLASS_FLOAT16_TENSOR_OP"); } #endif +#endif } // namespace test } // namespace megdnn #endif
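
---

For reference, after this patch every generated kernel source is wrapped in a guard derived from the operation's `required_cuda_ver_major` / `required_cuda_ver_minor` fields rather than the previously hard-coded 9.2 check. A minimal sketch of what a generated tensorop kernel stub looks like for a kernel that requires CUDA 10.2 follows; the operation name `initialize_example_tensorop_gemm` is a hypothetical stand-in (real names come from `procedural_name()`), and the registration body plus closing pragma placement are illustrative assumptions, not copied generator output:

    // Guard emitted from the parameterized header template: this file is a
    // no-op for nvcc older than 10.2, so the tensorop kernel never compiles
    // under an unsupported toolchain.
    #if __CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
    // ignore warning of cutlass
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-parameter"

    #include "cutlass/cutlass.h"
    #include "src/cuda/cutlass/library.h"

    namespace cutlass {
    namespace library {

    // Hypothetical operation name; the generator derives the real one from
    // Operation.procedural_name().
    void initialize_example_tensorop_gemm(Manifest &manifest) {
      // manifest.append(...) registrations emitted from the instance template
      // would go here.
    }

    }  // namespace library
    }  // namespace cutlass

    #pragma GCC diagnostic pop
    #endif

With an older nvcc the whole file compiles to an empty translation unit, which is why `initialize_all.cu` must also skip the matching `initialize_*` declarations and calls behind `CUTLASS_ARCH_MMA_SM75_SUPPORTED`, and why the float16 tensorop algorithm lists in `algos.h` / `algos.cpp` are fenced with `CUDA_VERSION >= 10020`.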