Browse Source

fix(dnn/cuda): fix cutlass tensorop kernels

Do not compile cutlass tensorop kernels when the CUDA version is less than 10.2.

GitOrigin-RevId: d4c37d5f41
tags/v1.6.0-rc1
Megvii Engine Team 3 years ago
parent
commit
ff0e6be7b9
14 changed files with 139 additions and 47 deletions
  1. +15
    -7
      dnn/scripts/cutlass_generator/conv2d_operation.py
  2. +26
    -11
      dnn/scripts/cutlass_generator/gemm_operation.py
  3. +21
    -7
      dnn/scripts/cutlass_generator/generator.py
  4. +10
    -2
      dnn/scripts/cutlass_generator/manifest.py
  5. +12
    -4
      dnn/src/cuda/cutlass/initialize_all.cu
  6. +4
    -0
      dnn/src/cuda/matrix_mul/algos.cpp
  7. +33
    -1
      dnn/src/cuda/matrix_mul/algos.h
  8. +2
    -2
      dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp
  9. +5
    -5
      dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp
  10. +1
    -1
      dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp
  11. +1
    -1
      dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp
  12. +4
    -4
      dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
  13. +1
    -1
      dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp
  14. +4
    -1
      dnn/test/cuda/cutlass_matmul.cpp

+ 15
- 7
dnn/scripts/cutlass_generator/conv2d_operation.py View File

@@ -19,7 +19,8 @@ class Conv2dOperation:
# #
def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \ def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \ epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False):
need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
required_cuda_ver_major = 9, required_cuda_ver_minor = 2):


self.operation_kind = OperationKind.Conv2d self.operation_kind = OperationKind.Conv2d
self.conv_kind = conv_kind self.conv_kind = conv_kind
@@ -36,6 +37,9 @@ class Conv2dOperation:
self.need_load_from_const = need_load_from_const self.need_load_from_const = need_load_from_const
self.implicit_gemm_mode = implicit_gemm_mode self.implicit_gemm_mode = implicit_gemm_mode
self.without_shared_load = without_shared_load self.without_shared_load = without_shared_load
self.required_cuda_ver_major = required_cuda_ver_major
self.required_cuda_ver_minor = required_cuda_ver_minor

# #
def accumulator_type(self): def accumulator_type(self):
accum = self.tile_description.math_instruction.element_accumulator accum = self.tile_description.math_instruction.element_accumulator
@@ -320,7 +324,8 @@ using Deconvolution =


# #
def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False):
skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \
required_cuda_ver_minor = 2):
operations = [] operations = []


element_epilogue = DataType.f32 element_epilogue = DataType.f32
@@ -407,10 +412,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay
bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type]))) bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type])) dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))


new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load)
new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
operations.append(new_operation) operations.append(new_operation)
if not skip_unity_kernel: if not skip_unity_kernel:
new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load)
new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
operations.append(new_operation) operations.append(new_operation)
return operations return operations


@@ -545,7 +550,7 @@ class EmitConvSingleKernelWrapper():
self.convolution_name = "Deconvolution" self.convolution_name = "Deconvolution"


self.header_template = """ self.header_template = """
#if !MEGDNN_TEGRA_X1
#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor})
// ignore warning of cutlass // ignore warning of cutlass
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-parameter"
@@ -589,14 +594,17 @@ void initialize_${operation_name}(Manifest &manifest) {
else: else:
self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name()) self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name())
self.kernel_file = open(self.kernel_path, "w") self.kernel_file = open(self.kernel_path, "w")
self.kernel_file.write(self.header_template)
self.kernel_file.write(SubstituteTemplate(self.header_template, {
'required_cuda_ver_major': str(self.operation.required_cuda_ver_major),
'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor),
}))
return self return self


# #
def emit(self): def emit(self):
self.kernel_file.write(SubstituteTemplate(self.instance_template, { self.kernel_file.write(SubstituteTemplate(self.instance_template, {
'operation_instance': self.instance_emitter.emit(self.operation), 'operation_instance': self.instance_emitter.emit(self.operation),
}))
}))


# emit manifest helper # emit manifest helper
manifest = SubstituteTemplate(self.manifest_template, { manifest = SubstituteTemplate(self.manifest_template, {


+ 26
- 11
dnn/scripts/cutlass_generator/gemm_operation.py View File

@@ -23,7 +23,8 @@ from library import *
class GemmOperation: class GemmOperation:
# #
def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \ def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8):
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
required_cuda_ver_major = 9, required_cuda_ver_minor = 2):


self.operation_kind = OperationKind.Gemm self.operation_kind = OperationKind.Gemm
self.arch = arch self.arch = arch
@@ -35,6 +36,9 @@ class GemmOperation:
self.element_epilogue = element_epilogue self.element_epilogue = element_epilogue
self.epilogue_functor = epilogue_functor self.epilogue_functor = epilogue_functor
self.swizzling_functor = swizzling_functor self.swizzling_functor = swizzling_functor
self.required_cuda_ver_major = required_cuda_ver_major
self.required_cuda_ver_minor = required_cuda_ver_minor



# #
def is_complex(self): def is_complex(self):
@@ -161,7 +165,8 @@ class GemmOperation:
# #
class GemvBatchedStridedOperation: class GemvBatchedStridedOperation:
# #
def __init__(self, gemm_kind, arch, math_inst, threadblock_shape, thread_shape, A, B, C):
def __init__(self, gemm_kind, arch, math_inst, threadblock_shape, thread_shape, A, B, C, \
required_cuda_ver_major = 9, required_cuda_ver_minor = 2):


self.operation_kind = OperationKind.Gemm self.operation_kind = OperationKind.Gemm
self.arch = arch self.arch = arch
@@ -172,6 +177,8 @@ class GemvBatchedStridedOperation:
self.A = A self.A = A
self.B = B self.B = B
self.C = C self.C = C
self.required_cuda_ver_major = required_cuda_ver_major
self.required_cuda_ver_minor = required_cuda_ver_minor


# #
def accumulator_type(self): def accumulator_type(self):
@@ -243,7 +250,7 @@ class GemvBatchedStridedOperation:
return self.procedural_name() return self.procedural_name()


# #
def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a = 32, align_b = 32, align_c = 32):
def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a = 32, align_b = 32, align_c = 32, required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
operations = [] operations = []
swizzling_functor = SwizzlingFunctor.Identity1 swizzling_functor = SwizzlingFunctor.Identity1


@@ -261,20 +268,23 @@ def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a
B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b])) B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b]))
C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c])) C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c]))
operations.append(GemmOperation(GemmKind.Gemm, min_cc, tile, A, B, C, \ operations.append(GemmOperation(GemmKind.Gemm, min_cc, tile, A, B, C, \
element_epilogue, epilogue, swizzling_functor))
element_epilogue, epilogue, swizzling_functor, \
required_cuda_ver_major, required_cuda_ver_minor))
operations.append(GemmOperation(GemmKind.SplitKParallel, min_cc, tile, A, B, C, \ operations.append(GemmOperation(GemmKind.SplitKParallel, min_cc, tile, A, B, C, \
element_epilogue, epilogue, swizzling_functor))
element_epilogue, epilogue, swizzling_functor, \
required_cuda_ver_major, required_cuda_ver_minor))
return operations return operations


def GeneratesGemv(math_inst, threadblock_shape, thread_shape, data_type, layout_a, layout_b, layout_c, min_cc, \ def GeneratesGemv(math_inst, threadblock_shape, thread_shape, data_type, layout_a, layout_b, layout_c, min_cc, \
align_a = 32, align_b = 32, align_c = 32):
align_a = 32, align_b = 32, align_c = 32, \
required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
element_a, element_b, element_c, element_epilogue = data_type element_a, element_b, element_c, element_epilogue = data_type


A = TensorDescription(element_a, layout_a, int(align_a//DataTypeSize[element_a])) A = TensorDescription(element_a, layout_a, int(align_a//DataTypeSize[element_a]))
B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b])) B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b]))
C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c])) C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c]))
return GemvBatchedStridedOperation(GemmKind.GemvBatchedStrided, min_cc, math_inst, threadblock_shape, thread_shape, \ return GemvBatchedStridedOperation(GemmKind.GemvBatchedStrided, min_cc, math_inst, threadblock_shape, thread_shape, \
A, B, C)
A, B, C, required_cuda_ver_major, required_cuda_ver_minor)


################################################################################################### ###################################################################################################
# #
@@ -1025,7 +1035,7 @@ class EmitGemmSingleKernelWrapper:
self.instance_emitter = instance_emitters[self.operation.gemm_kind] self.instance_emitter = instance_emitters[self.operation.gemm_kind]


self.header_template = """ self.header_template = """
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor})
// ignore warning of cutlass // ignore warning of cutlass
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-parameter"
@@ -1065,7 +1075,10 @@ void initialize_${operation_name}(Manifest &manifest) {
def __enter__(self): def __enter__(self):
self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name()) self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name())
self.kernel_file = open(self.kernel_path, "w") self.kernel_file = open(self.kernel_path, "w")
self.kernel_file.write(self.header_template)
self.kernel_file.write(SubstituteTemplate(self.header_template, {
'required_cuda_ver_major': str(self.operation.required_cuda_ver_major),
'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor),
}))
return self return self


# #
@@ -1109,7 +1122,7 @@ template void megdnn::cuda::cutlass_wrapper::
self.instance_emitter = EmitGemvBatchedStridedInstance() self.instance_emitter = EmitGemvBatchedStridedInstance()


self.header_template = """ self.header_template = """
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor})
// ignore warning of cutlass // ignore warning of cutlass
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-parameter"
@@ -1136,7 +1149,9 @@ ${operation_instance}
self.kernel_file = open(self.kernel_path, "w") self.kernel_file = open(self.kernel_path, "w")
self.kernel_file.write(SubstituteTemplate(self.header_template, { self.kernel_file.write(SubstituteTemplate(self.header_template, {
'wrapper_path': self.wrapper_path, 'wrapper_path': self.wrapper_path,
}))
'required_cuda_ver_major': str(self.operation.required_cuda_ver_major),
'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor),
}))
return self return self


# #


+ 21
- 7
dnn/scripts/cutlass_generator/generator.py View File

@@ -217,6 +217,9 @@ def GenerateConv2d_TensorOp_8816(args):
min_cc = 75 min_cc = 75
max_cc = 1024 max_cc = 1024


cuda_major = 10
cuda_minor = 2

for math_inst in math_instructions: for math_inst in math_instructions:
for layout in layouts: for layout in layouts:
for dst_type, dst_layout in zip(dst_types, dst_layouts): for dst_type, dst_layout in zip(dst_types, dst_layouts):
@@ -234,7 +237,7 @@ def GenerateConv2d_TensorOp_8816(args):
] ]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 128, 128, 64, dst_layout, dst_type, min_cc, 128, 128, 64,
False, ImplicitGemmMode.GemmTN, True)
False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
else: else:
assert dst_layout == LayoutType.TensorNC4HW4 assert dst_layout == LayoutType.TensorNC4HW4
tile_descriptions = [ tile_descriptions = [
@@ -250,7 +253,7 @@ def GenerateConv2d_TensorOp_8816(args):
] ]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 128, 128, 64, dst_layout, dst_type, min_cc, 128, 128, 64,
False)
False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
return operations return operations


@@ -281,6 +284,9 @@ def GenerateConv2d_TensorOp_8832(args):
min_cc = 75 min_cc = 75
max_cc = 1024 max_cc = 1024


cuda_major = 10
cuda_minor = 2

for math_inst in math_instructions: for math_inst in math_instructions:
for layout in layouts: for layout in layouts:
for dst_layout in dst_layouts: for dst_layout in dst_layouts:
@@ -293,7 +299,7 @@ def GenerateConv2d_TensorOp_8832(args):
] ]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 128, 128, 64, dst_layout, dst_type, min_cc, 128, 128, 64,
False, ImplicitGemmMode.GemmTN, True)
False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)


layouts_nhwc = [ layouts_nhwc = [
(LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32), (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),
@@ -316,12 +322,12 @@ def GenerateConv2d_TensorOp_8832(args):
for tile in tile_descriptions: for tile in tile_descriptions:
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
dst_layout, dst_type, min_cc, layout[2], layout[2], 32, dst_layout, dst_type, min_cc, layout[2], layout[2], 32,
False, ImplicitGemmMode.GemmTN, False)
False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64: if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
dst_align = 32 if tile.threadblock_shape[1] == 32 else 64 dst_align = 32 if tile.threadblock_shape[1] == 32 else 64
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align, dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align,
False, ImplicitGemmMode.GemmTN, True)
False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)


return operations return operations


@@ -624,6 +630,8 @@ def GeneratesGemm_TensorOp_1688(args):
alignment_constraints = [8, 4, 2, alignment_constraints = [8, 4, 2,
#1 #1
] ]
cuda_major = 10
cuda_minor = 2


operations = [] operations = []
for math_inst in math_instructions: for math_inst in math_instructions:
@@ -655,7 +663,9 @@ def GeneratesGemm_TensorOp_1688(args):
min_cc, \ min_cc, \
align * 16, \ align * 16, \
align * 16, \ align * 16, \
align * 16)
align * 16, \
cuda_major, \
cuda_minor)
return operations return operations


# #
@@ -686,6 +696,8 @@ def GeneratesGemm_TensorOp_884(args):
alignment_constraints = [8, 4, 2, alignment_constraints = [8, 4, 2,
# 1 # 1
] ]
cuda_major = 10
cuda_minor = 2


operations = [] operations = []
for math_inst in math_instructions: for math_inst in math_instructions:
@@ -717,7 +729,9 @@ def GeneratesGemm_TensorOp_884(args):
min_cc, \ min_cc, \
align * 16, \ align * 16, \
align * 16, \ align * 16, \
align * 16)
align * 16, \
cuda_major, \
cuda_minor)
return operations return operations




+ 10
- 2
dnn/scripts/cutlass_generator/manifest.py View File

@@ -351,6 +351,13 @@ void initialize_all(Manifest &manifest) {
################################################################################################### ###################################################################################################


def GenerateManifest(args, operations, output_dir): def GenerateManifest(args, operations, output_dir):
assert isinstance(operations, list)
if len(operations) == 0:
return
op = operations[0]
required_cuda_ver_major = op.required_cuda_ver_major
required_cuda_ver_minor = op.required_cuda_ver_minor

manifest_path = os.path.join(output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type)) manifest_path = os.path.join(output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type))
f = open(manifest_path, "w") f = open(manifest_path, "w")
f.write(""" f.write("""
@@ -358,7 +365,7 @@ def GenerateManifest(args, operations, output_dir):
Generated by generator.py - Do not edit. Generated by generator.py - Do not edit.
*/ */


#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
#if __CUDACC_VER_MAJOR__ > %s || (__CUDACC_VER_MAJOR__ == %s && __CUDACC_VER_MINOR__ >= %s)


#include "cutlass/cutlass.h" #include "cutlass/cutlass.h"
#include "src/cuda/cutlass/library.h" #include "src/cuda/cutlass/library.h"
@@ -367,7 +374,8 @@ def GenerateManifest(args, operations, output_dir):
namespace cutlass { namespace cutlass {
namespace library { namespace library {


""")
""" % (str(required_cuda_ver_major), str(required_cuda_ver_major), str(required_cuda_ver_minor)))

for op in operations: for op in operations:
f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name()) f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name())




+ 12
- 4
dnn/src/cuda/cutlass/initialize_all.cu View File

@@ -44,26 +44,34 @@ namespace cutlass {
namespace library { namespace library {


///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
#if ((__CUDACC_VER_MAJOR__ > 10) || \
(__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
#define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1
#endif


#if __CUDACC_VER_MAJOR__ > 9 || \ #if __CUDACC_VER_MAJOR__ > 9 || \
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)


void initialize_all_gemm_simt_operations(Manifest& manifest); void initialize_all_gemm_simt_operations(Manifest& manifest);
void initialize_all_conv2d_simt_operations(Manifest& manifest);
void initialize_all_deconv_simt_operations(Manifest& manifest);
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
void initialize_all_gemm_tensorop884_operations(Manifest& manifest); void initialize_all_gemm_tensorop884_operations(Manifest& manifest);
void initialize_all_gemm_tensorop1688_operations(Manifest& manifest); void initialize_all_gemm_tensorop1688_operations(Manifest& manifest);
void initialize_all_conv2d_simt_operations(Manifest& manifest);
void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest); void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest);
void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest); void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest);
void initialize_all_deconv_simt_operations(Manifest& manifest);
#endif


void initialize_all(Manifest& manifest) { void initialize_all(Manifest& manifest) {
initialize_all_gemm_simt_operations(manifest); initialize_all_gemm_simt_operations(manifest);
initialize_all_conv2d_simt_operations(manifest);
initialize_all_deconv_simt_operations(manifest);
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
initialize_all_gemm_tensorop884_operations(manifest); initialize_all_gemm_tensorop884_operations(manifest);
initialize_all_gemm_tensorop1688_operations(manifest); initialize_all_gemm_tensorop1688_operations(manifest);
initialize_all_conv2d_simt_operations(manifest);
initialize_all_conv2d_tensorop8816_operations(manifest); initialize_all_conv2d_tensorop8816_operations(manifest);
initialize_all_conv2d_tensorop8832_operations(manifest); initialize_all_conv2d_tensorop8832_operations(manifest);
initialize_all_deconv_simt_operations(manifest);
#endif
} }


#else #else


+ 4
- 0
dnn/src/cuda/matrix_mul/algos.cpp View File

@@ -43,6 +43,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() {
for (auto&& algo : simt_float32_gemv_batched_strided) { for (auto&& algo : simt_float32_gemv_batched_strided) {
all_algos.push_back(&algo); all_algos.push_back(&algo);
} }
#if CUDA_VERSION >= 10020
for (auto&& algo : tensorop_float16) { for (auto&& algo : tensorop_float16) {
all_algos.push_back(&algo); all_algos.push_back(&algo);
} }
@@ -50,6 +51,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() {
all_algos.push_back(&algo); all_algos.push_back(&algo);
} }
#endif #endif
#endif
all_algos.push_back(&naive); all_algos.push_back(&naive);


for (auto&& algo : all_algos) { for (auto&& algo : all_algos) {
@@ -107,7 +109,9 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() {
#define cb(...) \ #define cb(...) \
tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \ tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \
tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__}); tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__});
#if CUDA_VERSION >= 10020
FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb)
#endif
#undef cb #undef cb
#undef FOREACH_CUTLASS_MATMUL_F16_SHAPES #undef FOREACH_CUTLASS_MATMUL_F16_SHAPES
} }


+ 33
- 1
dnn/src/cuda/matrix_mul/algos.h View File

@@ -241,6 +241,20 @@ public:
return AlgoAttribute::REPRODUCIBLE; return AlgoAttribute::REPRODUCIBLE;
} }
MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT) MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT)
std::string param() const override {
std::string ret;
// FIXME: algo param compatible with old version, to avoid fastrun cache error
struct AlgoParam_ {
int threadblock_m, threadblock_n, threadblock_k;
int warp_m, warp_n, warp_k;
};
AlgoParam_ algo_param{
m_algo_param.threadblock_m, m_algo_param.threadblock_n,
m_algo_param.threadblock_k, m_algo_param.warp_m,
m_algo_param.warp_n, m_algo_param.warp_k};
serialize_write_pod(algo_param, ret);
return ret;
}


private: private:
void do_exec(const ExecArgs& args) const override; void do_exec(const ExecArgs& args) const override;
@@ -263,6 +277,21 @@ public:
AlgoAttribute::USABLE_DEPEND_ON_SHAPE; AlgoAttribute::USABLE_DEPEND_ON_SHAPE;
} }
MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K) MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K)
std::string param() const override {
std::string ret;
// FIXME: algo param compatible with old version, to avoid fastrun cache
// error
struct AlgoParam_ {
int threadblock_m, threadblock_n, threadblock_k;
int warp_m, warp_n, warp_k;
};
AlgoParam_ algo_param{
m_algo_param.threadblock_m, m_algo_param.threadblock_n,
m_algo_param.threadblock_k, m_algo_param.warp_m,
m_algo_param.warp_n, m_algo_param.warp_k};
serialize_write_pod(algo_param, ret);
return ret;
}


private: private:
void do_exec(const ExecArgs& args) const override; void do_exec(const ExecArgs& args) const override;
@@ -297,6 +326,7 @@ private:
std::string m_name; std::string m_name;
}; };


#if CUDA_VERSION >= 10020
class MatrixMulForwardImpl::AlgoFloat16TensorOp final class MatrixMulForwardImpl::AlgoFloat16TensorOp final
: public AlgoCutlassMatrixMulBase { : public AlgoCutlassMatrixMulBase {
public: public:
@@ -345,7 +375,7 @@ private:
int min_alignment_requirement() const override { return 2; } int min_alignment_requirement() const override { return 2; }
std::string m_name; std::string m_name;
}; };
#endif
#endif #endif


class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { class MatrixMulForwardImpl::AlgoPack : NonCopyableObj {
@@ -370,9 +400,11 @@ public:
std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k; std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k;
std::vector<AlgoFloat32SIMTGemvBatchedStrided> std::vector<AlgoFloat32SIMTGemvBatchedStrided>
simt_float32_gemv_batched_strided; simt_float32_gemv_batched_strided;
#if CUDA_VERSION >= 10020
std::vector<AlgoFloat16TensorOp> tensorop_float16; std::vector<AlgoFloat16TensorOp> tensorop_float16;
std::vector<AlgoFloat16TensorOpSplitK> tensorop_float16_split_k; std::vector<AlgoFloat16TensorOpSplitK> tensorop_float16_split_k;
#endif #endif
#endif
std::vector<AlgoBase*> all_algos; std::vector<AlgoBase*> all_algos;


const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }


+ 2
- 2
dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp View File

@@ -2,7 +2,7 @@
* \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp * \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
* *
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
@@ -15,7 +15,7 @@
#include "src/cuda/matrix_mul/algos.h" #include "src/cuda/matrix_mul/algos.h"
#include "src/cuda/utils.h" #include "src/cuda/utils.h"


#if CUDA_VERSION >= 9020
#if CUDA_VERSION >= 10020
using namespace megdnn; using namespace megdnn;
using namespace cuda; using namespace cuda;




+ 5
- 5
dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp View File

@@ -2,7 +2,7 @@
* \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp * \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
* *
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
@@ -15,14 +15,14 @@
#include "src/cuda/matrix_mul/algos.h" #include "src/cuda/matrix_mul/algos.h"
#include "src/cuda/utils.h" #include "src/cuda/utils.h"


#if CUDA_VERSION >= 9020
#if CUDA_VERSION >= 10020
using namespace megdnn; using namespace megdnn;
using namespace cuda; using namespace cuda;


bool MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::is_available( bool MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::is_available(
const SizeArgs& args) const { const SizeArgs& args) const {
auto&& param = args.opr->param(); auto&& param = args.opr->param();
int n = args.layout_c.shape[1],
int m = args.layout_c.shape[0], n = args.layout_c.shape[1],
k = args.layout_a.shape[param.transposeA ? 0 : 1]; k = args.layout_a.shape[param.transposeA ? 0 : 1];
bool available = bool available =
args.opr->param().format == param::MatrixMul::Format::DEFAULT && args.opr->param().format == param::MatrixMul::Format::DEFAULT &&
@@ -32,8 +32,8 @@ bool MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::is_available(
auto&& device_prop = cuda::current_device_prop(); auto&& device_prop = cuda::current_device_prop();
int y_grid_limit = device_prop.maxGridSize[1]; int y_grid_limit = device_prop.maxGridSize[1];
// limit y grid // limit y grid
available &= ((n + m_algo_param.threadblock_n - 1) /
m_algo_param.threadblock_n <=
available &= ((m + m_algo_param.threadblock_m - 1) /
m_algo_param.threadblock_m <=
y_grid_limit); y_grid_limit);
if (m_algo_param.instruction_m == 8 && m_algo_param.instruction_n == 8 && if (m_algo_param.instruction_m == 8 && m_algo_param.instruction_n == 8 &&
m_algo_param.instruction_k == 4) { m_algo_param.instruction_k == 4) {


+ 1
- 1
dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp View File

@@ -2,7 +2,7 @@
* \file dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
* *
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an


+ 1
- 1
dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp View File

@@ -2,7 +2,7 @@
* \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
* *
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an


+ 4
- 4
dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp View File

@@ -2,7 +2,7 @@
* \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
* *
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
@@ -22,7 +22,7 @@ using namespace cuda;
bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
const SizeArgs& args) const { const SizeArgs& args) const {
auto&& param = args.opr->param(); auto&& param = args.opr->param();
int n = args.layout_c.shape[1],
int m = args.layout_c.shape[0], n = args.layout_c.shape[1],
k = args.layout_a.shape[param.transposeA ? 0 : 1]; k = args.layout_a.shape[param.transposeA ? 0 : 1];
bool available = bool available =
args.opr->param().format == param::MatrixMul::Format::DEFAULT && args.opr->param().format == param::MatrixMul::Format::DEFAULT &&
@@ -32,8 +32,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
auto&& device_prop = cuda::current_device_prop(); auto&& device_prop = cuda::current_device_prop();
int y_grid_limit = device_prop.maxGridSize[1]; int y_grid_limit = device_prop.maxGridSize[1];
// limit y grid // limit y grid
available &= ((n + m_algo_param.threadblock_n - 1) /
m_algo_param.threadblock_n <=
available &= ((m + m_algo_param.threadblock_m - 1) /
m_algo_param.threadblock_m <=
y_grid_limit); y_grid_limit);
return available; return available;
} }


+ 1
- 1
dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp View File

@@ -2,7 +2,7 @@
* \file dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp * \file dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
* *
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an


+ 4
- 1
dnn/test/cuda/cutlass_matmul.cpp View File

@@ -21,7 +21,6 @@
#include "test/cuda/fixture.h" #include "test/cuda/fixture.h"
#include "test/cuda/utils.h" #include "test/cuda/utils.h"


#define MEGDNN_WITH_BENCHMARK 1
#if CUDA_VERSION >= 9020 #if CUDA_VERSION >= 9020
namespace megdnn { namespace megdnn {
namespace test { namespace test {
@@ -373,6 +372,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb #undef cb
#undef MEGDNN_FOREACH_CUTLASS_KERNEL #undef MEGDNN_FOREACH_CUTLASS_KERNEL


#if CUDA_VERSION >= 10020
#define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \
cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \ cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \
cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \ cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \
@@ -448,6 +448,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb #undef cb


#undef MEGDNN_FOREACH_CUTLASS_KERNEL #undef MEGDNN_FOREACH_CUTLASS_KERNEL
#endif


#if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL) { TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL) {
@@ -462,12 +463,14 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) {
"CUTLASS_FLOAT32_SIMT"); "CUTLASS_FLOAT32_SIMT");
} }


#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) { TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) {
benchmark_matrix_mul(handle_cuda(), get_f16_feat_model_args(), benchmark_matrix_mul(handle_cuda(), get_f16_feat_model_args(),
dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16(),
"CUTLASS_FLOAT16_TENSOR_OP"); "CUTLASS_FLOAT16_TENSOR_OP");
} }
#endif #endif
#endif
} // namespace test } // namespace test
} // namespace megdnn } // namespace megdnn
#endif #endif


Loading…
Cancel
Save