
feat(dnn/cuda): add nhwc int8 imma conv and conv fuse typecvt

GitOrigin-RevId: 229e1eb4be
Branch: release-1.6
Author: Megvii Engine Team, 3 years ago
Commit: 11f022ff7c
33 changed files with 1497 additions and 441 deletions
  1. dnn/scripts/cutlass_generator/conv2d_operation.py (+19, -17)
  2. dnn/scripts/cutlass_generator/generator.py (+102, -29)
  3. dnn/scripts/cutlass_generator/library.py (+18, -0)
  4. dnn/scripts/cutlass_generator/list.bzl (+455, -170)
  5. dnn/src/common/convolution.cpp (+4, -1)
  6. dnn/src/cuda/conv_bias/algo.cpp (+30, -1)
  7. dnn/src/cuda/conv_bias/algo.h (+45, -1)
  8. dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp (+16, -5)
  9. dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu (+45, -26)
  10. dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh (+3, -3)
  11. dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp (+16, -2)
  12. dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp (+10, -2)
  13. dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp (+34, -27)
  14. dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp (+10, -2)
  15. dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp (+12, -5)
  16. dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp (+294, -0)
  17. dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp (+29, -10)
  18. dnn/src/cuda/conv_bias/opr_impl.h (+1, -0)
  19. dnn/src/cuda/convolution/backward_data/algo.h (+2, -0)
  20. dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp (+41, -30)
  21. dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp (+42, -27)
  22. dnn/src/cuda/cutlass/convolution_operation.h (+2, -2)
  23. dnn/src/cuda/cutlass/library.h (+1, -1)
  24. dnn/src/cuda/cutlass/operation_table.cpp (+15, -13)
  25. dnn/src/cuda/cutlass/operation_table.h (+6, -6)
  26. dnn/src/cuda/cutlass/util.cu (+29, -0)
  27. dnn/src/cuda/cutlass/util.h (+4, -0)
  28. dnn/src/cuda/matrix_mul/algos.h (+4, -0)
  29. dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp (+36, -28)
  30. dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp (+36, -29)
  31. dnn/test/cuda/conv_bias_int8.cpp (+130, -1)
  32. dnn/test/cuda/conv_test_utils.cpp (+6, -2)
  33. dnn/test/cuda/convolution.cpp (+0, -1)

dnn/scripts/cutlass_generator/conv2d_operation.py (+19, -17)

@@ -19,8 +19,8 @@ class Conv2dOperation:
#
def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, \
without_shared_load = False, required_cuda_ver_major = 9, required_cuda_ver_minor = 2):

self.operation_kind = OperationKind.Conv2d
self.conv_kind = conv_kind
@@ -34,7 +34,7 @@ class Conv2dOperation:
self.element_epilogue = element_epilogue
self.epilogue_functor = epilogue_functor
self.swizzling_functor = swizzling_functor
self.need_load_from_const = need_load_from_const
self.special_optimization = special_optimization
self.implicit_gemm_mode = implicit_gemm_mode
self.without_shared_load = without_shared_load
self.required_cuda_ver_major = required_cuda_ver_major
@@ -60,16 +60,18 @@ class Conv2dOperation:
else:
inst_shape = ''

unity_kernel = ''
if not self.need_load_from_const:
unity_kernel = '_1x1'
special_opt = ''
if self.special_optimization == SpecialOptimizeDesc.ConvFilterUnity:
special_opt = '_1x1'
elif self.special_optimization == SpecialOptimizeDesc.DeconvDoubleUpsampling:
special_opt = '_s2'

reorder_k = ''
if self.without_shared_load:
reorder_k = '_roc'

return "%s%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \
inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \
inst_shape, intermediate_type, ConvKindNames[self.conv_kind], special_opt, \
reorder_k, ShortEpilogueNames[self.epilogue_functor])
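
The suffix logic above is what produces the `_1x1`, `_s2`, and `_roc` pieces visible in the generated kernel names in list.bzl further down. A minimal standalone sketch of that naming rule, with the new SpecialOptimizeDesc enum redeclared locally purely for illustration:

```python
# Illustrative sketch (not the generator itself): how procedural kernel names
# pick up the special-optimization and reorder-K suffixes introduced here.
import enum

class SpecialOptimizeDesc(enum.Enum):  # mirrors the enum added in library.py
    NoneSpecialOpt = 0
    ConvFilterUnity = 1
    DeconvDoubleUpsampling = 2

def name_suffixes(special_optimization, without_shared_load):
    special_opt = ''
    if special_optimization == SpecialOptimizeDesc.ConvFilterUnity:
        special_opt = '_1x1'   # 1x1-filter fast path
    elif special_optimization == SpecialOptimizeDesc.DeconvDoubleUpsampling:
        special_opt = '_s2'    # stride-2 deconv fast path
    reorder_k = '_roc' if without_shared_load else ''
    return special_opt + reorder_k

assert name_suffixes(SpecialOptimizeDesc.ConvFilterUnity, True) == '_1x1_roc'
assert name_suffixes(SpecialOptimizeDesc.DeconvDoubleUpsampling, False) == '_s2'
```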

#
@@ -183,7 +185,7 @@ using Convolution =
${stages},
${alignment_src},
${alignment_filter},
${nonuninity_kernel},
${special_optimization},
${math_operator},
${implicit_gemm_mode},
${without_shared_load}>;
@@ -226,7 +228,7 @@ using Convolution =
'stages': str(operation.tile_description.stages),
'alignment_src': str(operation.src.alignment),
'alignment_filter': str(operation.flt.alignment),
'nonuninity_kernel': str(operation.need_load_from_const).lower(),
'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode],
'without_shared_load': str(operation.without_shared_load).lower()
@@ -266,7 +268,7 @@ using Deconvolution =
${stages},
${alignment_src},
${alignment_filter},
${nonuninity_kernel},
${special_optimization},
${math_operator},
${implicit_gemm_mode}>;
"""
@@ -308,7 +310,7 @@ using Deconvolution =
'stages': str(operation.tile_description.stages),
'alignment_src': str(operation.src.alignment),
'alignment_filter': str(operation.flt.alignment),
'nonuninity_kernel': str(operation.need_load_from_const).lower(),
'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode]
}
@@ -323,9 +325,9 @@ using Deconvolution =
###################################################################################################

#
def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \
required_cuda_ver_minor = 2):
def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 32, \
use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
operations = []

element_epilogue = DataType.f32
@@ -412,10 +414,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay
bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))

new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
operations.append(new_operation)
if not skip_unity_kernel:
new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt:
new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, use_special_optimization , implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
operations.append(new_operation)
return operations
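
The tail of GenerateConv2d above replaces the old skip_unity_kernel flag with use_special_optimization: a plain kernel is always emitted, and a second specialized variant only when a special optimization is requested. A small self-contained sketch of that rule, using string stand-ins instead of the real SpecialOptimizeDesc values:

```python
# Sketch of the new emission rule (names here are illustrative stand-ins).
def emit_variants(use_special_optimization, none_opt='none'):
    variants = [none_opt]                      # plain kernel, no special path
    if use_special_optimization != none_opt:   # e.g. 'conv_filter_unity'
        variants.append(use_special_optimization)
    return variants

assert emit_variants('none') == ['none']
assert emit_variants('conv_filter_unity') == ['none', 'conv_filter_unity']
```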



dnn/scripts/cutlass_generator/generator.py (+102, -29)

@@ -168,10 +168,10 @@ def GenerateConv2d_Simt(args):
for dst_type, dst_layout in zip(dst_types, dst_layouts):
if dst_type == DataType.s4 or dst_type == DataType.u4:
min_cc = 75
skip_unity_kernel = True
use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt
else:
min_cc = 61
skip_unity_kernel = False
use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
tile_descriptions = [
TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
@@ -182,10 +182,16 @@ def GenerateConv2d_Simt(args):
TileDescription([ 64, 32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 32, 32, 32,
skip_unity_kernel)
]
for tile in tile_descriptions:
if dst_layout == LayoutType.TensorNC32HW32 and tile.threadblock_shape[0] > 32:
continue
if (dst_layout == LayoutType.TensorNCHW or dst_layout == LayoutType.TensorNHWC) \
and tile.threadblock_shape[0] > 16:
continue
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
dst_layout, dst_type, min_cc, 32, 32, 32,
use_special_optimization)
return operations
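
The per-tile loop added above filters SIMT tile shapes by destination layout before calling GenerateConv2d. A compact sketch of that filter, with layouts written as plain strings for illustration:

```python
# Sketch of the SIMT tile filter: some destination layouts only accept small
# threadblock-M tiles (thresholds mirror the loop above).
def tile_allowed(dst_layout, threadblock_m):
    if dst_layout == 'NC32HW32' and threadblock_m > 32:
        return False
    if dst_layout in ('NCHW', 'NHWC') and threadblock_m > 16:
        return False
    return True

assert tile_allowed('NC32HW32', 32)
assert not tile_allowed('NHWC', 64)
assert tile_allowed('NC4HW4', 128)
```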


@@ -214,6 +220,8 @@ def GenerateConv2d_TensorOp_8816(args):
DataType.s8,
]

use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity

min_cc = 75
max_cc = 1024

@@ -232,28 +240,69 @@ def GenerateConv2d_TensorOp_8816(args):
TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 128, 128, 64,
False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
else:
assert dst_layout == LayoutType.TensorNC4HW4
tile_descriptions = [
TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 128, 128, 64,
False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization,
ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)

layouts_nhwc = [
(LayoutType.TensorNHWC, LayoutType.TensorNC4HW4, 32),
(LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 64),
(LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 128),
]

dst_layouts_nhwc = [
LayoutType.TensorNHWC,
]

for math_inst in math_instructions:
for layout in layouts_nhwc:
for dst_layout in dst_layouts_nhwc:
dst_type = math_inst.element_b
tile_descriptions = [
TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
]
for tile in tile_descriptions:
dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
if tile.threadblock_shape[1] == 16 or tile.threadblock_shape[1] == 32:
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)

out_dtypes = [DataType.s4, DataType.u4, DataType.f32]

#INT8x8x4 and INT8x8x32
for math_inst in math_instructions:
for layout in layouts_nhwc:
for dst_layout in dst_layouts_nhwc:
for out_dtype in out_dtypes:
tile_descriptions = [
TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
]
for tile in tile_descriptions:
dst_align = 4 * DataTypeSize[out_dtype] if tile.threadblock_shape[1] == 16 or out_dtype == DataType.f32 \
else 8 * DataTypeSize[out_dtype]
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
if tile.threadblock_shape[1] == 16 or (tile.threadblock_shape[1] == 32 and out_dtype != DataType.f32):
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
return operations
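
The INT8x8x4/INT8x8x32 loop above is where the fused type conversion from the commit title shows up: int8 NHWC convolutions may now write s4, u4, or f32 outputs, and dst_align is chosen from the store width. A sketch of that alignment rule, assuming alignments are expressed in bits as elsewhere in the generator:

```python
# Sketch of the destination-alignment rule for NHWC int8 kernels with fused
# output type conversion (values in bits).
DataTypeSize = {'s4': 4, 'u4': 4, 's8': 8, 'f32': 32}   # subset, for illustration

def dst_align_bits(out_dtype, threadblock_n):
    # Narrow tiles (N == 16) and f32 outputs store 4 elements per access;
    # wider tiles store 8 elements per access.
    elems = 4 if threadblock_n == 16 or out_dtype == 'f32' else 8
    return elems * DataTypeSize[out_dtype]

assert dst_align_bits('s4', 32) == 32    # 8 x 4-bit
assert dst_align_bits('u4', 16) == 16    # 4 x 4-bit
assert dst_align_bits('f32', 32) == 128  # 4 x 32-bit
```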

@@ -281,6 +330,8 @@ def GenerateConv2d_TensorOp_8832(args):
LayoutType.TensorNC64HW64,
]

use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity

min_cc = 75
max_cc = 1024

@@ -298,8 +349,8 @@ def GenerateConv2d_TensorOp_8832(args):
TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 128, 128, 64,
False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)

layouts_nhwc = [
(LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),
@@ -316,18 +367,39 @@ def GenerateConv2d_TensorOp_8832(args):
for dst_layout in dst_layouts_nhwc:
dst_type = math_inst.element_b
tile_descriptions = [
TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
]
for tile in tile_descriptions:
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
dst_layout, dst_type, min_cc, layout[2], layout[2], 32,
False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
dst_align = 16 if tile.threadblock_shape[1] == 16 else 32
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
dst_align = 32 if tile.threadblock_shape[1] == 32 else 64
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align,
False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
# INT4x4x8
for math_inst in math_instructions:
for layout in layouts_nhwc:
for dst_layout in dst_layouts_nhwc:
tile_descriptions = [
TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
]
for tile in tile_descriptions:
dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
dst_align = 64 if tile.threadblock_shape[1] == 32 else 128
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)

return operations

@@ -354,6 +426,8 @@ def GenerateDeconv_Simt(args):
DataType.s8,
]

use_special_optimization = SpecialOptimizeDesc.DeconvDoubleUpsampling

min_cc = 61
max_cc = 1024

@@ -361,7 +435,6 @@ def GenerateDeconv_Simt(args):
for layout in layouts:
for dst_type, dst_layout in zip(dst_types, dst_layouts):
tile_descriptions = [
TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
@@ -369,7 +442,7 @@ def GenerateDeconv_Simt(args):
]
operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 32, 32, 32,
True)
use_special_optimization)
return operations

################################################################################


dnn/scripts/cutlass_generator/library.py (+18, -0)

@@ -562,6 +562,24 @@ StrideSupportNames = {
StrideSupport.Unity: 'unity_stride',
}

class SpecialOptimizeDesc(enum.Enum):
NoneSpecialOpt = enum_auto()
ConvFilterUnity = enum_auto()
DeconvDoubleUpsampling = enum_auto()

SpecialOptimizeDescNames = {
SpecialOptimizeDesc.NoneSpecialOpt: 'none',
SpecialOptimizeDesc.ConvFilterUnity: 'conv_filter_unity',
SpecialOptimizeDesc.DeconvDoubleUpsampling: 'deconv_double_upsampling',
}

SpecialOptimizeDescTag = {
SpecialOptimizeDesc.NoneSpecialOpt: 'cutlass::conv::SpecialOptimizeDesc::NONE',
SpecialOptimizeDesc.ConvFilterUnity: 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY',
SpecialOptimizeDesc.DeconvDoubleUpsampling: 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING',
}


class ImplicitGemmMode(enum.Enum):
GemmNT = enum_auto()
GemmTN = enum_auto()
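
The new SpecialOptimizeDescTag table above is consumed by the ${special_optimization} slot of the kernel templates in conv2d_operation.py. An illustrative sketch of that substitution using Python's string.Template (the generator's own template helper may differ), with the table keyed by name for brevity:

```python
# Illustrative sketch: substituting the CUTLASS C++ tag into the template slot.
from string import Template

special_optimize_tag = {  # mirrors SpecialOptimizeDescTag, keyed by name here
    'NoneSpecialOpt': 'cutlass::conv::SpecialOptimizeDesc::NONE',
    'ConvFilterUnity': 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY',
    'DeconvDoubleUpsampling': 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING',
}

template = Template("    ${special_optimization},\n    ${math_operator},")
print(template.substitute(
    special_optimization=special_optimize_tag['ConvFilterUnity'],
    math_operator='cutlass::arch::OpMultiplyAddSaturate'))
```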


dnn/scripts/cutlass_generator/list.bzl (+455, -170)

@@ -455,11 +455,14 @@ cutlass_gen_list = [
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu",
"cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu",
"cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu",
"cutlass_simt_s8_idgrad_id_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_s2_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_s2_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu",
"all_deconv_simt_operations.cu",
"cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
@@ -515,30 +518,6 @@ cutlass_gen_list = [
"cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
@@ -551,108 +530,18 @@ cutlass_gen_list = [
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_u4_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_f32_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
@@ -708,72 +597,288 @@ cutlass_gen_list = [
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"all_conv2d_tensorop8816_operations.cu",
"cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
@@ -815,6 +920,12 @@ cutlass_gen_list = [
"cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
@@ -839,6 +950,12 @@ cutlass_gen_list = [
"cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
@@ -863,6 +980,12 @@ cutlass_gen_list = [
"cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
@@ -887,6 +1010,10 @@ cutlass_gen_list = [
"cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
@@ -903,6 +1030,10 @@ cutlass_gen_list = [
"cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
@@ -919,6 +1050,10 @@ cutlass_gen_list = [
"cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
@@ -935,5 +1070,155 @@ cutlass_gen_list = [
"cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"all_conv2d_tensorop8832_operations.cu",
]
]
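
A note on the generated file names listed above: each entry appears to encode its kernel configuration directly in the name. The short C++ sketch below re-assembles one such name; the field order (dst dtype, mma shape, optional 1x1 and roc markers, activation, src dtype, threadblock shape, warp shape, stage count, src layout, filter layout) is inferred from the list itself, and the meaning of "roc" is a guess, so treat the whole decomposition as an assumption rather than the generator's actual logic.

// Hypothetical re-assembly of a generated kernel file name; purely
// illustrative, not the Python generator's real implementation.
#include <cstdio>
#include <string>

std::string kernel_file_name(const std::string& dst, const std::string& mma,
                             bool conv1x1, bool roc, const std::string& act,
                             const std::string& src, const std::string& tb,
                             const std::string& warp, int stages,
                             const std::string& src_layout,
                             const std::string& flt_layout) {
    std::string name = "cutlass_tensorop_" + dst + "_" + mma + "fprop_";
    if (conv1x1)
        name += "1x1_";  // filter-unity (1x1) specialization
    if (roc)
        name += "roc_";  // presumably the reordered-OC / without_shared_load variant
    name += act + "_" + src + "_" + tb + "_" + warp + "_" +
            std::to_string(stages) + "_" + src_layout + "_" + flt_layout + ".cu";
    return name;
}

int main() {
    std::printf("%s\n",
                kernel_file_name("s8", "i8832", false, true, "relu", "s4",
                                 "128x64x64", "64x64x64", 1, "nhwc",
                                 "nc32hw32").c_str());
    // -> cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu
}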

+ 4
- 1
dnn/src/common/convolution.cpp View File

@@ -553,7 +553,10 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src,
dst.valid() && (dst.enumv() == src.enumv() ||
((dst.enumv() == DTypeEnum::QuantizedS4 ||
dst.enumv() == DTypeEnum::Quantized4Asymm) &&
src.enumv() == DTypeEnum::QuantizedS8));
src.enumv() == DTypeEnum::QuantizedS8) ||
((src.enumv() == DTypeEnum::QuantizedS4 ||
src.enumv() == DTypeEnum::Quantized4Asymm) &&
dst.enumv() == DTypeEnum::QuantizedS8));
if (cond_dst) {
supported_dst_dtype.push_back(dst);
}
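
The hunk above widens the dtype deduction so that, besides keeping the source dtype or narrowing QuantizedS8 to a 4-bit output, a 4-bit source (QuantizedS4 or Quantized4Asymm) may now produce a QuantizedS8 destination, which is what enables the fused conv + typecvt path. A minimal standalone sketch of the accepted (src, dst) pairs, using a stand-in DTypeEnum rather than megdnn's own header:

#include <cstdio>

enum class DTypeEnum { QuantizedS8, QuantizedS4, Quantized4Asymm };

// dst may keep src's dtype, or convert 8-bit <-> 4-bit in either direction.
bool fused_typecvt_dst_ok(DTypeEnum src, DTypeEnum dst) {
    bool s8_to_4bit = src == DTypeEnum::QuantizedS8 &&
                      (dst == DTypeEnum::QuantizedS4 ||
                       dst == DTypeEnum::Quantized4Asymm);
    bool q4_to_s8 = (src == DTypeEnum::QuantizedS4 ||
                     src == DTypeEnum::Quantized4Asymm) &&
                    dst == DTypeEnum::QuantizedS8;
    return dst == src || s8_to_4bit || q4_to_s8;
}

int main() {
    std::printf("%d\n", fused_typecvt_dst_ok(DTypeEnum::QuantizedS4,
                                             DTypeEnum::QuantizedS8));  // 1
}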


+ 30
- 1
dnn/src/cuda/conv_bias/algo.cpp View File

@@ -71,6 +71,9 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
for (auto&& algo : int8_nchw32_imma) {
all_algos.push_back(&algo);
}
for (auto&& algo : int8_nhwc_imma) {
all_algos.push_back(&algo);
}
for (auto&& algo : int4_int4_nchw64_imma) {
all_algos.push_back(&algo);
}
@@ -236,7 +239,21 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
int8_nchw32_imma.emplace_back(
AlgoParam{32, 128, 32, 32, 64, 32, 8, 8, 16, 1});
}

{
using AlgoParam = AlgoInt8NHWCIMMAImplicitGemm::AlgoParam;
int8_nhwc_imma.emplace_back(
AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 16});
int8_nhwc_imma.emplace_back(
AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 8});
int8_nhwc_imma.emplace_back(
AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 4});
int8_nhwc_imma.emplace_back(
AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 16});
int8_nhwc_imma.emplace_back(
AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 8});
int8_nhwc_imma.emplace_back(
AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 4});
}
{
using AlgoParam = AlgoInt4Int4NCHW64IMMAImplicitGemm::AlgoParam;
int4_int4_nchw64_imma.emplace_back(
@@ -262,6 +279,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
{
using AlgoParam = AlgoInt4Int4NHWCIMMAImplicitGemm::AlgoParam;
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32});
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16});
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8});
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32});
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 16});
@@ -277,6 +300,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
{
using AlgoParam = AlgoUInt4Int4NHWCIMMAImplicitGemm::AlgoParam;
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32});
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16});
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8});
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32});
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 16});
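
The brace-initialized AlgoParam entries above are positional. Going by to_string() and the ConvolutionKey construction elsewhere in this commit, the order appears to be threadblock shape, warp shape, mma instruction shape, stage count, and access size; the named struct below is only a reading aid under that assumption, not the real AlgoParam definition.

#include <cstdio>

struct AlgoParamSketch {
    int threadblock_m, threadblock_n, threadblock_k;
    int warp_m, warp_n, warp_k;
    int instruction_m, instruction_n, instruction_k;
    int stage;
    int access_size;  // elements per global-memory access (NHWC algos)
};

int main() {
    // One of the new int8 NHWC entries: 64x16x32 threadblock, 64x16x32 warp,
    // 8x8x16 mma instruction, 2 stages, 16-element access.
    AlgoParamSketch p{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 16};
    // Reconstructs the algorithm name the same way to_string() formats it.
    std::printf("INT8_NHWC_IMMA_IMPLICIT_GEMM_%dX%dX%d_%dX%dX%d_%d_%d\n",
                p.threadblock_m, p.threadblock_n, p.threadblock_k, p.warp_m,
                p.warp_n, p.warp_k, p.stage, p.access_size);
}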


+ 45
- 1
dnn/src/cuda/conv_bias/algo.h View File

@@ -72,6 +72,7 @@ public:
CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8,
CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8,
CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8,
CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8,
CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4,
CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4,
CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4,
@@ -524,6 +525,7 @@ public:
* +
* +--- AlgoInt8NCHW4DotProdImplicitGemm
* +--- AlgoInt8NCHW32IMMAImplicitGemm
* +--- AlgoInt8NHWCIMMAImplicitGemm
* +
* +--- AlgoInt4NCHW64IMMAImplicitGemmBase
* +----+--- AlgoInt4Int4NCHW64IMMAImplicitGemm
@@ -582,7 +584,7 @@ public:
// operation (cutlass kernel) from the global OperationTable
const cutlass::library::Operation* get_cutlass_conv_op(
const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
bool load_from_const, bool without_shared_load) const;
bool use_conv_filter_unity_opt, bool without_shared_load) const;

// execute the cutlass kernel found by get_cutlass_conv_op. we give
// subclasses full freedom to decide where and how these arguments are
@@ -829,6 +831,47 @@ private:
std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm final
: public AlgoCutlassConvolutionBase {
public:
AlgoInt8NHWCIMMAImplicitGemm(AlgoParam algo_param)
: AlgoCutlassConvolutionBase(algo_param) {
m_name = ConvBias::algo_name<ConvBias::DirectParam>(
ssprintf("INT8_NHWC_IMMA_IMPLICIT_GEMM_%s",
to_string(m_algo_param).c_str()),
ConvBias::DirectParam{});
}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE;
}
static std::string to_string(AlgoParam algo_param);
size_t get_preprocess_workspace_in_bytes(
const SizeArgs& args) const override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const SizeArgs& args) const override;
void exec_preprocess(const ExecArgs& args) const override;
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8)

std::string param() const override {
std::string ret;
serialize_write_pod(m_algo_param, ret);
return ret;
}

private:
std::tuple<float, float, float, float, float> get_constants(
const ExecArgs& args) const;

void reorder_filter(const ExecArgs& args, int interleaved,
void* reordered_filter) const;

std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase
: public AlgoCutlassConvolutionBase {
public:
@@ -1087,6 +1130,7 @@ public:
#endif
#if CUDA_VERSION >= 10020
std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma;
std::vector<AlgoInt4Int4NCHW64IMMAImplicitGemm> int4_int4_nchw64_imma;
std::vector<AlgoUInt4Int4NCHW64IMMAImplicitGemm> uint4_int4_nchw64_imma;
std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;


+ 16
- 5
dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp View File

@@ -140,6 +140,11 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format,
LayoutTypeID::kTensorNC64HW64};
case Format::NHWC:
switch (access_type) {
case 4:
return {LayoutTypeID::kTensorNHWC,
LayoutTypeID::kTensorNC4HW4,
LayoutTypeID::kTensorNHWC,
LayoutTypeID::kTensorNHWC};
case 8:
return {LayoutTypeID::kTensorNHWC,
LayoutTypeID::kTensorNC8HW8,
@@ -192,12 +197,18 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode,
const Operation*
ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
bool load_from_const, bool without_shared_load) const {
using Format = param::ConvBias::Format;
bool use_conv_filter_unity_opt, bool without_shared_load) const {
auto&& param = args.opr->param();
auto layouts = get_layout_pack(param.format, m_algo_param.access_size);
auto epilogue_type = get_epilogue_type(param.nonlineMode,
param.format != Format::NCHW4_NCHW);
auto epilogue_type = get_epilogue_type(
param.nonlineMode,
args.dst_layout->dtype.enumv() != DTypeEnum::Float32);

cutlass::conv::SpecialOptimizeDesc special_optimization =
(use_conv_filter_unity_opt)
? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY
: cutlass::conv::SpecialOptimizeDesc::NONE;

ConvolutionKey key{convert_conv_op(conv_op),
convert_dtype(args.src_layout->dtype.enumv()),
layouts.src,
@@ -219,7 +230,7 @@ ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
m_algo_param.instruction_k,
epilogue_type,
m_algo_param.stage,
load_from_const,
special_optimization,
without_shared_load};

return Singleton::get().operation_table.find_op(key);
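
Two things change in get_cutlass_conv_op above: the old load_from_const boolean becomes an explicit cutlass::conv::SpecialOptimizeDesc, and the epilogue variant is now chosen by the destination dtype instead of the tensor format. The sketch below captures that selection logic under the assumption that the second argument of get_epilogue_type picks the clamping (saturating) epilogue; the enum values are stand-ins modeled on names visible in this diff.

#include <cstdio>

enum class SpecialOptimizeDesc { NONE, CONV_FILTER_UNITY, DECONV_DOUBLE_UPSAMPLING };
enum class EpilogueKind { BiasAddLinearCombination, BiasAddLinearCombinationClamp };

SpecialOptimizeDesc select_special_opt(bool use_conv_filter_unity_opt) {
    // a 1x1 filter lets the kernel skip the filter-pixel loop
    return use_conv_filter_unity_opt ? SpecialOptimizeDesc::CONV_FILTER_UNITY
                                     : SpecialOptimizeDesc::NONE;
}

EpilogueKind select_epilogue(bool dst_is_float32) {
    // quantized outputs saturate to their representable range; a float32
    // output (conv fused with a dequantizing typecvt) must not be clamped
    return dst_is_float32 ? EpilogueKind::BiasAddLinearCombination
                          : EpilogueKind::BiasAddLinearCombinationClamp;
}

int main() {
    std::printf("%d %d\n", static_cast<int>(select_special_opt(true)),
                static_cast<int>(select_epilogue(false)));  // 1 1
}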


+ 45
- 26
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu View File

@@ -144,28 +144,48 @@ void megdnn::cuda::cutlass_wrapper::reorder_ncxhwx_imma_filter(
IC, FH, FW, trans_oc);
after_kernel_launch();
}

template <uint32_t size_bits, uint32_t alignbits>
template <uint32_t size_bits>
void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter(
int8_t* dst_filter, const int8_t* src_filter, uint32_t OC, uint32_t IC,
uint32_t FH, uint32_t FW, bool trans_oc, uint32_t oc_interleaved,
cudaStream_t stream) {
static constexpr uint32_t elements_per_access = alignbits / size_bits;
uint32_t nr_threads =
query_blocksize_for_kernel(reinterpret_cast<const void*>(
reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32>));
uint32_t FH, uint32_t FW, bool trans_oc, uint32_t alignbits,
uint32_t interleaved, cudaStream_t stream) {
const uint32_t elements_per_access = alignbits / size_bits;

void (*kern)(int8_t* __restrict__, const int8_t* __restrict__, uint32_t,
uint32_t, uint32_t, uint32_t, bool);
kern = nullptr;

auto get_kern = [&kern](const uint32_t alignbits,
const uint32_t interleaved) {
#define DISPATCH_KERNEL(alignbits_, interleaved_) \
if (alignbits == alignbits_ && interleaved == interleaved_) { \
kern = reorder_nhwc_imma_filter_kernel<size_bits, alignbits_, \
interleaved_>; \
return; \
}
DISPATCH_KERNEL(128, 16);
DISPATCH_KERNEL(64, 16);
DISPATCH_KERNEL(32, 16);
DISPATCH_KERNEL(128, 32);
DISPATCH_KERNEL(64, 32);
DISPATCH_KERNEL(32, 32);
DISPATCH_KERNEL(128, 64);
DISPATCH_KERNEL(64, 64);
DISPATCH_KERNEL(32, 64);

#undef DISPATCH_KERNEL
};

get_kern(alignbits, interleaved);

uint32_t nr_threads = query_blocksize_for_kernel(kern);
uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_access);
nr_threads = std::min(nr_threads, vthreads);
uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
if (oc_interleaved == 32) {
reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32>
<<<nr_blocks, nr_threads, 0, stream>>>(
dst_filter, src_filter, OC, IC, FH, FW, trans_oc);
} else {
reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 64>
<<<nr_blocks, nr_threads, 0, stream>>>(
dst_filter, src_filter, OC, IC, FH, FW, trans_oc);
}

kern<<<nr_blocks, nr_threads, 0, stream>>>(dst_filter, src_filter, OC, IC,
FH, FW, trans_oc);

after_kernel_launch();
}

@@ -180,15 +200,14 @@ INST(8, 32)
INST(4, 64)
#undef INST

#define INST(_size_bits, _alignbits) \
template void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter< \
_size_bits, _alignbits>( \
int8_t * dst_filter, const int8_t* src_filter, uint32_t OC, \
uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc, \
uint32_t oc_interleaved, cudaStream_t stream);
INST(4, 32)
INST(4, 64)
INST(4, 128)
#define INST(_size_bits) \
template void \
megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter<_size_bits>( \
int8_t * dst_filter, const int8_t* src_filter, uint32_t OC, \
uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc, \
uint32_t alignbits, uint32_t interleaved, cudaStream_t stream);
INST(4)
INST(8)
#undef INST

// vim: syntax=cuda.doxygen
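
The rewritten reorder_nhwc_imma_filter above moves alignbits and interleaved from template parameters to runtime arguments and uses a small macro table to pick the matching kernel instantiation. Below is a host-only sketch of that dispatch pattern, with stub function names standing in for the real CUDA kernels:

#include <cstdint>
#include <cstdio>

template <uint32_t size_bits, uint32_t alignbits, uint32_t interleaved>
void reorder_kernel_stub() {
    std::printf("size_bits=%u alignbits=%u interleaved=%u\n", size_bits,
                alignbits, interleaved);
}

template <uint32_t size_bits>
void dispatch_reorder(uint32_t alignbits, uint32_t interleaved) {
    void (*kern)() = nullptr;
#define DISPATCH_KERNEL(alignbits_, interleaved_)                        \
    if (alignbits == alignbits_ && interleaved == interleaved_) {        \
        kern = reorder_kernel_stub<size_bits, alignbits_, interleaved_>; \
    }
    DISPATCH_KERNEL(128, 16);
    DISPATCH_KERNEL(64, 16);
    DISPATCH_KERNEL(32, 16);
    DISPATCH_KERNEL(128, 32);
    DISPATCH_KERNEL(64, 32);
    DISPATCH_KERNEL(32, 32);
#undef DISPATCH_KERNEL
    if (kern)
        kern();  // the real code launches the selected kernel on a CUDA stream
}

int main() {
    dispatch_reorder<8>(128, 16);  // int8 filter, 128-bit access, interleave 16
}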

+ 3
- 3
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh View File

@@ -23,11 +23,11 @@ void reorder_ncxhwx_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
uint32_t FW, bool trans_oc,
cudaStream_t stream);

template <uint32_t size_bits, uint32_t alignbits>
template <uint32_t size_bits>
void reorder_nhwc_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
uint32_t OC, uint32_t IC, uint32_t FH,
uint32_t FW, bool trans_oc,
uint32_t oc_interleaved, cudaStream_t stream);
uint32_t FW, bool trans_oc, uint32_t alignbits,
uint32_t interleaved, cudaStream_t stream);
} // namespace cutlass_wrapper
} // namespace cuda
} // namespace megdnn

+ 16
- 2
dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp View File

@@ -68,13 +68,27 @@ ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::get_constants(
args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
bias_scale =
args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
dst_scale;

if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
} else { // DTypeEnum::QuantizedS8
megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8);
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
}

float alpha = src_scale * filter_scale / dst_scale,
beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f;

if (args.z_layout->ndim > 0) {
float z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
float z_scale;
if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
} else { // DTypeEnum::QuantizedS8
megdnn_assert(args.z_layout->dtype.enumv() ==
DTypeEnum::QuantizedS8);
z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
}
gamma = z_scale / dst_scale;
}



+ 10
- 2
dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp View File

@@ -76,6 +76,14 @@ bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available(
if (fh * fw > kMaxFilterPixels)
return false;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = true;
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
if (op == nullptr)
return false;

return true;
}

@@ -110,7 +118,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(
float dst_scale = 0.f;
float threshold = 0.f;
uint8_t src_zero = 0;
bool load_from_const = !(fh == 1 && fw == 1);
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = true;

if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
@@ -126,7 +134,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(

const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
ConvType::kConvolution,
load_from_const, without_shared_load);
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,


+ 34
- 27
dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp View File

@@ -56,8 +56,11 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available(

if (args.src_layout->dtype.enumv() != src_dtype() ||
args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 ||
args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 ||
args.dst_layout->dtype.enumv() != src_dtype())
args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32)
return false;

if (!(args.dst_layout->dtype.enumv() == src_dtype() ||
args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8))
return false;

// uint4 does not support H_SWISH activation
@@ -83,6 +86,16 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available(
if ((co % 8 != 0) || (ci % m_algo_param.access_size != 0))
return false;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 32 ||
m_algo_param.threadblock_n == 64));
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
if (op == nullptr)
return false;

return true;
}

@@ -117,26 +130,31 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec(
float dst_scale = 0.f;
float threshold = 0.f;
uint8_t src_zero = 0;
bool load_from_const = !(fh == 1 && fw == 1);
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);

bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 32 ||
m_algo_param.threadblock_n == 64));

if (args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
}

if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
} else { // DTypeEnum::QuantizedS4
} else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
} else { // DTypeEnum::QuantizedS8
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
}

cudaStream_t stream = cuda_stream(args.opr->handle());

const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
ConvType::kConvolution,
load_from_const, without_shared_load);
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,
@@ -166,29 +184,18 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter(
cudaStream_t stream = cuda_stream(args.opr->handle());

// reformat filter from nhwc to ncxhwx and reorder oc
// use trans_oc threadblock_n must be 32 or 64
// to use trans_oc, threadblock_n must be 32 or 64 and src dtype must equal dst dtype
bool trans_oc = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 32 ||
m_algo_param.threadblock_n == 64));
uint32_t oc_iterleave = (m_algo_param.threadblock_n == 64) ? 64 : 32;

if (iterleaved == 8) {
cutlass_wrapper::reorder_nhwc_imma_filter<4, 32>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
fh, fw, trans_oc, oc_iterleave, stream);
} else if (iterleaved == 16) {
cutlass_wrapper::reorder_nhwc_imma_filter<4, 64>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
fh, fw, trans_oc, oc_iterleave, stream);
} else {
megdnn_assert(iterleaved == 32);
cutlass_wrapper::reorder_nhwc_imma_filter<4, 128>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
fh, fw, trans_oc, oc_iterleave, stream);
}
uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 64) ? 64 : 32;

uint32_t alignbits = iterleaved * 4;

cutlass_wrapper::reorder_nhwc_imma_filter<4>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh,
fw, trans_oc, alignbits, oc_iterleaved, stream);
}
#endif
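
The per-case calls removed above are replaced by a single computation of the access alignment in bits: alignbits is just the access size in elements times the element width, 4 bits here and 8 bits in the int8 NHWC algorithm. A one-line sketch covering both cases:

#include <cstdint>
#include <cstdio>

// align (bits) = elements per access * element width (bits)
uint32_t alignbits_for(uint32_t access_size, uint32_t size_bits) {
    return access_size * size_bits;
}

int main() {
    std::printf("%u %u\n",
                alignbits_for(32, 4),   // int4, 32-element access -> 128 bits
                alignbits_for(16, 8));  // int8, 16-element access -> 128 bits
}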



+ 10
- 2
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp View File

@@ -77,6 +77,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available(
// FIXME: too large filter size is not supported now
size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 32) - 2;
available &= fh * fw <= kMaxFilterPixels;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = (param.format == Format::NCHW32);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
available &= (op != nullptr);

return available;
}

@@ -155,12 +163,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
gamma = z_scale / dst_scale;
}
float delta = 0.f, theta = 0.f, threshold = 0.f;
bool load_from_const = !(fh == 1 && fw == 1);
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = (param.format == Format::NCHW32);

const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
ConvType::kConvolution,
load_from_const, without_shared_load);
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr,


+ 12
- 5
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp View File

@@ -98,7 +98,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
// FIXME: too large filter size is not supported now
size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 4) - 2;
available &= fh * fw <= kMaxFilterPixels;
;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = false;
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
available &= (op != nullptr);

return available;
}

@@ -213,12 +220,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
}
}
float threshold = 0.f;
bool load_from_const = !(fh == 1 && fw == 1);
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = false;

const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
ConvType::kConvolution,
load_from_const, without_shared_load);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr,


+ 294
- 0
dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp View File

@@ -0,0 +1,294 @@
/**
* \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/conv_bias/cutlass_reorder_filter.cuh"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace convolution;

#if CUDA_VERSION >= 10020
bool ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::is_available(
const SizeArgs& args) const {
if (args.bias_layout->ndim <= 0)
return false;

using Param = param::ConvBias;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
using NonlineMode = megdnn::param::ConvBias::NonlineMode;

auto&& param = args.opr->param();

if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
return false;

if (param.format != Format::NHWC || param.sparse != Sparse::DENSE ||
param.mode != Mode::CROSS_CORRELATION)
return false;

if (param.nonlineMode != NonlineMode::IDENTITY &&
param.nonlineMode != NonlineMode::RELU &&
param.nonlineMode != NonlineMode::H_SWISH)
return false;

if (args.src_layout->dtype.enumv() != DTypeEnum::QuantizedS8 ||
args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS8)
return false;

auto dst_dtype = args.dst_layout->dtype.enumv();

if (!(dst_dtype == DTypeEnum::QuantizedS8 ||
dst_dtype == DTypeEnum::QuantizedS4 ||
dst_dtype == DTypeEnum::Quantized4Asymm ||
dst_dtype == DTypeEnum::Float32))
return false;

if (!(args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
dst_dtype == DTypeEnum::Float32)))
return false;

if (!is_compute_capability_required(7, 5))
return false;

size_t co = args.filter_layout->operator[](0),
ci = args.filter_layout->operator[](3),
fh = args.filter_layout->operator[](1),
fw = args.filter_layout->operator[](2);

// param buffer size is 4K, use 3.4K to store precomputed offset
size_t kMaxFilterPixels =
848 / (m_algo_param.warp_k / m_algo_param.access_size) - 1;
if (fh * fw > kMaxFilterPixels)
return false;
// co must be a multiple of 4, and ci must be a multiple of
// m_algo_param.access_size
if ((co % 4 != 0) || (ci % m_algo_param.access_size != 0))
return false;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 16 ||
(m_algo_param.threadblock_n == 32 &&
dst_dtype != DTypeEnum::Float32)));
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
if (op == nullptr)
return false;

return true;
}

size_t
ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_workspace_in_bytes(
const SizeArgs& args) const {
if (args.preprocessed_filter) {
return 0;
} else {
return args.filter_layout->span().dist_byte();
}
}

size_t ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::
get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
return 0;
}

SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::
deduce_preprocessed_filter_layout(const SizeArgs& args) const {
return {args.filter_layout->collapse_contiguous()};
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec_preprocess(
const ExecArgs& args) const {
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
reorder_filter(args, m_algo_param.access_size, filter_ptr);
}

std::tuple<float, float, float, float, float>
ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_constants(
const ExecArgs& args) const {
float src_scale = args.src_layout->dtype.param<dtype::QuantizedS8>().scale,
filter_scale =
args.filter_layout->dtype.param<dtype::QuantizedS8>().scale,
bias_scale = 1.f, dst_scale;

if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) {
bias_scale = args.bias_layout->dtype.param<dtype::QuantizedS32>().scale;
}

uint8_t dst_zero = 0;

if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
} else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
} else if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
} else { // DTypeEnum::Float32
megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
dst_scale = 1.f;
}

float alpha = src_scale * filter_scale / dst_scale,
beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
theta = dst_zero;

if (args.z_layout->ndim > 0) {
float z_scale;
if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS8) {
z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
gamma = z_scale / dst_scale;
} else if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
gamma = z_scale / dst_scale;
} else if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
z_scale =
args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
uint8_t z_zero =
args.z_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
gamma = z_scale / dst_scale;
delta = -z_zero * gamma;
} else { // DTypeEnum::Float32
megdnn_assert(args.z_layout->dtype.enumv() == DTypeEnum::Float32);
gamma = 1.f;
}
}

if (args.opr->param().nonlineMode ==
param::ConvBias::NonlineMode::IDENTITY) {
delta += theta;
theta = 0.f;
}

return {alpha, beta, gamma, delta, theta};
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
size_t n = args.src_layout->operator[](0),
ci = args.src_layout->operator[](3),
hi = args.src_layout->operator[](1),
wi = args.src_layout->operator[](2);
size_t co = args.dst_layout->operator[](3),
ho = args.dst_layout->operator[](1),
wo = args.dst_layout->operator[](2);
UNPACK_CONV_PARAMETER(fm, param);
MARK_USED_VAR

void* filter_ptr = nullptr;
void* bias_ptr = nullptr;
void* z_ptr = nullptr;

if (args.preprocessed_filter) {
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
} else {
filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
reorder_filter(args, m_algo_param.access_size, filter_ptr);
}
bias_ptr = args.bias_tensor->raw_ptr;

if (args.z_layout->ndim > 0)
z_ptr = args.z_tensor->raw_ptr;

// \note these constants of cutlass epilogue will be passed to method
// `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*,
// a different dtype here results in undefined epilogue behaviors
float alpha, beta, gamma, delta, theta;
std::tie(alpha, beta, gamma, delta, theta) = get_constants(args);

float dst_scale = 1.f;
float threshold = 0.f;
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);

auto dst_dtype = args.dst_layout->dtype.enumv();

bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 16 ||
(m_algo_param.threadblock_n == 32 &&
dst_dtype != DTypeEnum::Float32)));

if (dst_dtype == DTypeEnum::QuantizedS8) { // DTypeEnum::QuantizedS8
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
} else if (dst_dtype == DTypeEnum::QuantizedS4) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
} else if (dst_dtype == DTypeEnum::Quantized4Asymm) {
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
} else { // DTypeEnum::Float32
dst_scale = 1.f;
}

cudaStream_t stream = cuda_stream(args.opr->handle());

const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,
ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw,
&alpha, &beta, &gamma, &delta, &theta, &threshold,
&dst_scale, stream);

after_kernel_launch();
}

std::string ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::to_string(
AlgoParam algo_param) {
return ssprintf("%dX%dX%d_%dX%dX%d_%d_%d", algo_param.threadblock_m,
algo_param.threadblock_n, algo_param.threadblock_k,
algo_param.warp_m, algo_param.warp_n, algo_param.warp_k,
algo_param.stage, algo_param.access_size);
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::reorder_filter(
const ExecArgs& args, const int iterleaved,
void* reordered_filter) const {
size_t co = args.filter_layout->operator[](0),
ci = args.filter_layout->operator[](3),
fh = args.filter_layout->operator[](1),
fw = args.filter_layout->operator[](2);

cudaStream_t stream = cuda_stream(args.opr->handle());

// reformat filter from nhwc to ncxhwx and reorder oc
// to use trans_oc, threadblock_n must be 16 or 32 and src dtype must equal dst dtype
bool trans_oc = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 16 ||
(m_algo_param.threadblock_n == 32 &&
args.dst_layout->dtype.enumv() != DTypeEnum::Float32)));
uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 32) ? 32 : 16;

uint32_t alignbits = iterleaved * 8;

cutlass_wrapper::reorder_nhwc_imma_filter<8>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh,
fw, trans_oc, alignbits, oc_iterleaved, stream);
}
#endif

// vim: syntax=cpp.doxygen
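
To make the epilogue scaling in get_constants() above a bit more concrete, here is a worked numeric sketch for an int8 src/filter with a Quantized4Asymm dst, no z input, and IDENTITY nonlinearity; every scale value is made up purely for illustration.

#include <cstdio>

int main() {
    float src_scale = 0.02f, filter_scale = 0.05f, bias_scale = 0.001f;
    float dst_scale = 0.1f;
    float dst_zero = 8.f;  // zero point of the asymmetric 4-bit output

    float alpha = src_scale * filter_scale / dst_scale;  // rescales the conv accumulator
    float beta = bias_scale / dst_scale;                 // rescales the bias
    float gamma = 0.f, delta = 0.f;                      // no z tensor
    float theta = dst_zero;                              // output zero point

    // With IDENTITY nonlinearity the zero point is folded into delta,
    // matching the "delta += theta; theta = 0" branch above.
    delta += theta;
    theta = 0.f;

    std::printf("alpha=%g beta=%g gamma=%g delta=%g theta=%g\n", alpha, beta,
                gamma, delta, theta);
}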

+ 29
- 10
dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp View File

@@ -102,22 +102,41 @@ ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::get_constants(
args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
bias_scale =
args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
dst_scale;

uint8_t dst_zero = 0;

if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;

dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
} else { // DTypeEnum::QuantizedS8
megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8);
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
}

uint8_t dst_zero =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
float alpha = src_scale * filter_scale / dst_scale,
beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
theta = dst_zero;

if (args.z_layout->ndim > 0) {
float z_scale =
args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
gamma = z_scale / dst_scale;
uint8_t z_zero =
args.z_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
delta = -z_zero * gamma;
float z_scale;
if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
z_scale =
args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
uint8_t z_zero =
args.z_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
gamma = z_scale / dst_scale;
delta = -z_zero * gamma;
} else { // DTypeEnum::QuantizedS8
megdnn_assert(args.z_layout->dtype.enumv() ==
DTypeEnum::QuantizedS8);
z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
gamma = z_scale / dst_scale;
}
}

// identity epilogue has no theta:


+ 1
- 0
dnn/src/cuda/conv_bias/opr_impl.h View File

@@ -65,6 +65,7 @@ public:
class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
class AlgoInt8NCHW32IMMAImplicitGemm;
class AlgoInt8NHWCIMMAImplicitGemm;
class AlgoInt4NCHW64IMMAImplicitGemmBase;
class AlgoInt4Int4NCHW64IMMAImplicitGemm;
class AlgoUInt4Int4NCHW64IMMAImplicitGemm;


+ 2
- 0
dnn/src/cuda/convolution/backward_data/algo.h View File

@@ -275,6 +275,7 @@ public:
private:
WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
const SizeArgs& args) const;
const void* get_available_op(const SizeArgs& args) const;
AlgoParam m_algo_param;
std::string m_name;
};
@@ -295,6 +296,7 @@ public:
private:
WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
const SizeArgs& args) const;
const void* get_available_op(const SizeArgs& args) const;
};

class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {


+ 41
- 30
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp View File

@@ -20,6 +20,43 @@
using namespace megdnn;
using namespace cuda;

const void*
ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_available_op(
const SizeArgs& args) const {
using namespace cutlass::library;
auto&& fm = args.filter_meta;
size_t sh = fm.stride[0], sw = fm.stride[1];
cutlass::conv::SpecialOptimizeDesc special_optimization =
(sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc::
DECONV_DOUBLE_UPSAMPLING
: cutlass::conv::SpecialOptimizeDesc::NONE;
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS8,
LayoutTypeID::kTensorK4RSC4,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
4,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
special_optimization,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
is_available(const SizeArgs& args) const {
auto&& fm = args.filter_meta;
@@ -51,6 +88,7 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
// FIXME: too large filter size is not supported now
available &= fm.spatial[0] * fm.spatial[1] <=
(uint32_t)(848 / (2 * m_algo_param.warp_k / 4) - 2);
available &= (get_available_op(args) != nullptr);
// only support sm_61 or later, platform should have fast native int8
// support
available &= is_compute_capability_required(6, 1);
@@ -105,40 +143,14 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
args.grad_layout->dtype.param<dtype::QuantizedS8>().scale;

// \note these constants of cutlass epilogue will be passed to struct
// `ConvolutionArguments` by pointer and interpreted as ElementCompute*, a
// different dtype here results in undefined epilogue behaviors
// `ConvolutionArguments` by pointer and interpreted as ElementCompute*,
// a different dtype here results in undefined epilogue behaviors
float alpha = diff_scale * filter_scale / grad_scale, beta = 0.f,
gamma = 0.f, delta = 0.f;

using namespace cutlass::library;

// only use 16x64x8_16x64x8_2stages impl
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS8,
LayoutTypeID::kTensorK4RSC4,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
4,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
true,
false};

const Operation* op = Singleton::get().operation_table.find_op(key);
const Operation* op = (const Operation*)get_available_op(args);

// gcc prints warnings when size_t values are implicitly narrowed to int
cutlass::conv::Conv2dProblemSize problem_size{
@@ -167,7 +179,6 @@ void ConvolutionBackwardDataImpl::AlgoPack::fill_int8_dp4a_algos() {
int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2});
}

// vim: syntax=cpp.doxygen
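The hunks above illustrate a refactoring pattern that repeats for the NCHW dgrad kernel and the two float32 SIMT GEMM algorithms later in this diff: the cutlass key construction moves out of exec() into a get_available_op() helper, is_available() additionally requires that the lookup succeeds, and exec() reuses the helper instead of rebuilding the key. A minimal sketch of the pattern (SomeAlgo and make_key are illustrative names, not code from this commit):

    const void* SomeAlgo::get_available_op(const SizeArgs& args) const {
        using namespace cutlass::library;
        // build the full kernel key once, from the filter meta and the tile parameters
        ConvolutionKey key = make_key(args, m_algo_param);
        // find_op() now returns nullptr on a miss instead of asserting
        return (const void*)Singleton::get().operation_table.find_op(key);
    }

    bool SomeAlgo::is_available(const SizeArgs& args) const {
        bool available = true;  // shape/dtype/arch checks elided
        // "available" now also means "a matching cutlass kernel was generated"
        available &= (get_available_op(args) != nullptr);
        return available;
    }

    void SomeAlgo::exec(const ExecArgs& args) const {
        auto* op = static_cast<const Operation*>(get_available_op(args));
        // ... fill the argument struct and run op ...
    }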

+ 42
- 27
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp View File

@@ -19,6 +19,44 @@
using namespace megdnn;
using namespace cuda;

const void*
ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::get_available_op(
const SizeArgs& args) const {
using namespace cutlass::library;
auto&& fm = args.filter_meta;
size_t sh = fm.stride[0], sw = fm.stride[1];
cutlass::conv::SpecialOptimizeDesc special_optimization =
(sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc::
DECONV_DOUBLE_UPSAMPLING
: cutlass::conv::SpecialOptimizeDesc::NONE;
// only use 16x64x8_16x64x8_2stages impl
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS8,
LayoutTypeID::kTensorK4RSC4,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
cutlass::conv::ConvType::kConvolution,
16,
64,
8,
16,
64,
8,
1,
1,
4,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
2,
special_optimization,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available(
const SizeArgs& args) const {
auto&& fm = args.filter_meta;
@@ -52,6 +90,9 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available(
available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1);
// FIXME: too large filter size is not supported now
available &= fm.spatial[0] * fm.spatial[1] <= (848 / (2 * 8 / 4) - 2);

available &= (get_available_op(args) != nullptr);

// only support sm_61 or later, platform should have fast native int8
// support
available &= is_compute_capability_required(6, 1);
@@ -138,33 +179,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec(

using namespace cutlass::library;

// only use 16x64x8_16x64x8_2stages impl
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS8,
LayoutTypeID::kTensorK4RSC4,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
cutlass::conv::ConvType::kConvolution,
16,
64,
8,
16,
64,
8,
1,
1,
4,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
2,
true,
false};

const Operation* op = Singleton::get().operation_table.find_op(key);
const Operation* op = (const Operation*)get_available_op(args);

// gcc prints warnings when size_t values are implicitly narrowed to int
cutlass::conv::Conv2dProblemSize problem_size{


+ 2
- 2
dnn/src/cuda/cutlass/convolution_operation.h View File

@@ -119,8 +119,8 @@ public:
m_description.threadblock_swizzle = ThreadblockSwizzleMap<
typename Operator::ThreadblockSwizzle>::kId;

m_description.need_load_from_const_mem =
Operator::kNeedLoadFromConstMem;
m_description.special_optimization =
Operator::kSpecialOpt;
m_description.gemm_mode = Operator::kGemmMode;
m_description.without_shared_load = Operator::kWithoutSharedLoad;
}


+ 1
- 1
dnn/src/cuda/cutlass/library.h View File

@@ -487,7 +487,7 @@ struct ConvolutionDescription : public OperationDescription {

ThreadblockSwizzleID threadblock_swizzle;

bool need_load_from_const_mem;
conv::SpecialOptimizeDesc special_optimization;
conv::ImplicitGemmMode gemm_mode;
bool without_shared_load;
};


+ 15
- 13
dnn/src/cuda/cutlass/operation_table.cpp View File

@@ -124,7 +124,7 @@ ConvolutionKey get_convolution_key_from_desc(
key.epilogue_type = desc.epilogue_type;

key.stages = desc.tile_description.threadblock_stages;
key.need_load_from_const_mem = desc.need_load_from_const_mem;
key.special_optimization = desc.special_optimization;
key.without_shared_load = desc.without_shared_load;

return key;
@@ -156,23 +156,25 @@ void OperationTable::append(Manifest const& manifest) {
/////////////////////////////////////////////////////////////////////////////////////////////////

Operation const* OperationTable::find_op(GemmKey const& key) const {
megdnn_assert(gemm_operations.count(key) > 0,
"key not found in cutlass operation table");
auto const& ops = gemm_operations.at(key);
megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
ops.size());
return ops[0];
if (gemm_operations.count(key)) {
auto const& ops = gemm_operations.at(key);
megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
ops.size());
return ops[0];
}
return nullptr;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

Operation const* OperationTable::find_op(ConvolutionKey const& key) const {
megdnn_assert(convolution_operations.count(key) > 0,
"key not found in cutlass operation table");
auto const& ops = convolution_operations.at(key);
megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
ops.size());
return ops[0];
if (convolution_operations.count(key) > 0) {
auto const& ops = convolution_operations.at(key);
megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
ops.size());
return ops[0];
}
return nullptr;
}

/////////////////////////////////////////////////////////////////////////////////////////////////
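With this change a key that has no generated kernel is no longer fatal: find_op() returns nullptr and the caller decides how to react. The algorithms touched in this commit simply report themselves as unavailable, roughly (illustrative, not a line from the diff):

    const Operation* op = Singleton::get().operation_table.find_op(key);
    if (op == nullptr) {
        return false;  // treat a missing kernel as "algorithm unavailable" instead of asserting
    }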


+ 6
- 6
dnn/src/cuda/cutlass/operation_table.h View File

@@ -211,7 +211,7 @@ struct ConvolutionKey {

epilogue::EpilogueType epilogue_type;
int stages;
bool need_load_from_const_mem;
conv::SpecialOptimizeDesc special_optimization;
bool without_shared_load;

inline bool operator==(ConvolutionKey const& rhs) const {
@@ -234,7 +234,7 @@ struct ConvolutionKey {
(instruction_shape_n == rhs.instruction_shape_n) &&
(instruction_shape_k == rhs.instruction_shape_k) &&
(epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) &&
(need_load_from_const_mem == rhs.need_load_from_const_mem) &&
(special_optimization == rhs.special_optimization) &&
(without_shared_load == rhs.without_shared_load);
}

@@ -270,8 +270,8 @@ struct ConvolutionKey {
"\n instruction_shape: " + instruction_shape_str +
"\n epilogue_type: " + to_string(epilogue_type) +
"\n stages: " + std::to_string(stages) +
"\n need_load_from_const_mem: " +
to_string(need_load_from_const_mem) +
"\n special_optimization: " +
to_string(special_optimization) +
"\n without_shared_load: " + to_string(without_shared_load) +
"\n}";
}
@@ -308,8 +308,8 @@ struct ConvolutionKeyHasher {
sizeof(key.instruction_shape_k))
.update(&key.epilogue_type, sizeof(key.epilogue_type))
.update(&key.stages, sizeof(key.stages))
.update(&key.need_load_from_const_mem,
sizeof(key.need_load_from_const_mem))
.update(&key.special_optimization,
sizeof(key.special_optimization))
.update(&key.without_shared_load,
sizeof(key.without_shared_load))
.digest();


+ 29
- 0
dnn/src/cuda/cutlass/util.cu View File

@@ -1569,6 +1569,35 @@ char const* to_string(MathOperationID math_op, bool pretty) {
static struct {
char const* text;
char const* pretty;
conv::SpecialOptimizeDesc enumerant;
} SpecialOptimizeDesc_enumerants[] = {
{"none_special_opt", "NoneSpecialOpt", conv::SpecialOptimizeDesc::NONE},
{"conv_filter_unity", "ConvFilterUnity",
conv::SpecialOptimizeDesc::CONV_FILTER_UNITY},
{"deconv_double_upsampling", "DeconvDoubleUpsampling",
conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING},
};

/// Converts an SpecialOptimizeDesc enumerant to a string
char const* to_string(conv::SpecialOptimizeDesc special_opt, bool pretty) {
for (auto const& possible : SpecialOptimizeDesc_enumerants) {
if (special_opt == possible.enumerant) {
if (pretty) {
return possible.pretty;
} else {
return possible.text;
}
}
}

return pretty ? "Invalid" : "invalid";
}
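Assuming the enumerant table above, the new overload behaves like its neighbours, e.g.:

    to_string(conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING);            // "deconv_double_upsampling"
    to_string(conv::SpecialOptimizeDesc::CONV_FILTER_UNITY, /*pretty=*/true);  // "ConvFilterUnity"

The key-printing code in operation_table.h (earlier in this diff) already calls it for the special_optimization field.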

///////////////////////////////////////////////////////////////////////////////////////////////////

static struct {
char const* text;
char const* pretty;
conv::ImplicitGemmMode enumerant;
} ImplicitGemmMode_enumerants[] = {
{"gemm_nt", "GemmNT", conv::ImplicitGemmMode::GEMM_NT},


+ 4
- 0
dnn/src/cuda/cutlass/util.h View File

@@ -207,6 +207,10 @@ char const* to_string(bool val, bool pretty = false);
/// Converts a MathOperationID enumerant to a string
char const* to_string(MathOperationID math_op, bool pretty = false);

/// Converts a SpecialOptimizeDesc enumerant to a string
char const* to_string(conv::SpecialOptimizeDesc special_opt,
bool pretty = false);

/// Converts an ImplicitGemmMode enumerant to a string
char const* to_string(conv::ImplicitGemmMode mode, bool pretty = false);



+ 4
- 0
dnn/src/cuda/matrix_mul/algos.h View File

@@ -235,6 +235,7 @@ public:
m_name{ssprintf("CUTLASS_FLOAT32_SIMT_%s",
m_algo_param.to_string().c_str())} {}
bool is_available(const SizeArgs& args) const override;

size_t get_workspace_in_bytes(const SizeArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override {
@@ -260,6 +261,7 @@ private:
void do_exec(const ExecArgs& args) const override;
int min_alignment_requirement() const override { return 1; }
std::string m_name;
const void* get_available_op(const SizeArgs& args) const;
};

class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final
@@ -270,6 +272,7 @@ public:
m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s",
m_algo_param.to_string().c_str())} {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override {
@@ -297,6 +300,7 @@ private:
void do_exec(const ExecArgs& args) const override;
int min_alignment_requirement() const override { return 1; }
std::string m_name;
const void* get_available_op(const SizeArgs& args) const;
};

class MatrixMulForwardImpl::AlgoFloat32SIMTGemvBatchedStrided final


+ 36
- 28
dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp View File

@@ -19,6 +19,39 @@
using namespace megdnn;
using namespace cuda;

const void* MatrixMulForwardImpl::AlgoFloat32SIMT::get_available_op(
const SizeArgs& args) const {
using namespace cutlass::library;
auto&& param = args.opr->param();
auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;
auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;

int alignment = min_alignment_requirement();
GemmKey key{NumericTypeID::kF32,
layoutA,
NumericTypeID::kF32,
layoutB,
NumericTypeID::kF32,
LayoutTypeID::kRowMajor,
NumericTypeID::kF32,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
2,
alignment,
alignment,
SplitKMode::kNone};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available(
const SizeArgs& args) const {
bool available =
@@ -34,6 +67,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available(
m_algo_param.threadblock_n <=
y_grid_limit);

available &= (get_available_op(args) != nullptr);

return available;
}

@@ -61,34 +96,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMT::do_exec(

using namespace cutlass::library;

auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;
auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;

int alignment = min_alignment_requirement();
GemmKey key{NumericTypeID::kF32,
layoutA,
NumericTypeID::kF32,
layoutB,
NumericTypeID::kF32,
LayoutTypeID::kRowMajor,
NumericTypeID::kF32,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
2,
alignment,
alignment,
SplitKMode::kNone};

const Operation* op = Singleton::get().operation_table.find_op(key);
const Operation* op = (const Operation*)get_available_op(args);

GemmArguments gemm_args{problem_size,
args.tensor_a.raw_ptr,


+ 36
- 29
dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp View File

@@ -19,6 +19,39 @@
using namespace megdnn;
using namespace cuda;

const void* MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_available_op(
const SizeArgs& args) const {
using namespace cutlass::library;
auto&& param = args.opr->param();
auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;
auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;

int alignment = min_alignment_requirement();
GemmKey key{NumericTypeID::kF32,
layoutA,
NumericTypeID::kF32,
layoutB,
NumericTypeID::kF32,
LayoutTypeID::kRowMajor,
NumericTypeID::kF32,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
2,
alignment,
alignment,
SplitKMode::kParallel};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
const SizeArgs& args) const {
auto&& param = args.opr->param();
@@ -35,6 +68,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
available &= ((m + m_algo_param.threadblock_m - 1) /
m_algo_param.threadblock_m <=
y_grid_limit);
available &= (get_available_op(args) != nullptr);

return available;
}

@@ -66,35 +101,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::do_exec(
float alpha = 1.f, beta = 0.f;

using namespace cutlass::library;

auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;
auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;

int alignment = min_alignment_requirement();
GemmKey key{NumericTypeID::kF32,
layoutA,
NumericTypeID::kF32,
layoutB,
NumericTypeID::kF32,
LayoutTypeID::kRowMajor,
NumericTypeID::kF32,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
2,
alignment,
alignment,
SplitKMode::kParallel};

Operation const* op = Singleton::get().operation_table.find_op(key);
const Operation* op = (const Operation*)get_available_op(args);

GemmArguments gemm_args{problem_size,
args.tensor_a.raw_ptr,


+ 130
- 1
dnn/test/cuda/conv_bias_int8.cpp View File

@@ -882,6 +882,125 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
ConvBias::DirectParam{});
check(algo);
}

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC) {
require_compute_capability(7, 5);
Checker<ConvBiasForward> checker(handle_cuda());
auto check = [&checker](const std::string& algo) {
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
UniformIntRNG rng{-8, 8};
UniformIntRNG bias_rng{-50, 50};
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &bias_rng)
.set_rng(3, &rng)
.set_dtype(0, dtype::QuantizedS8{1.2f})
.set_dtype(1, dtype::QuantizedS8{1.3f})
.set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
.set_dtype(3, dtype::QuantizedS8{19.990229f})
.set_dtype(4, dtype::QuantizedS8{19.990228f})
.set_epsilon(1e-3);
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
param.pad_h = param.pad_w = 0;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
};
std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
ConvBias::DirectParam{});
check(algo);
algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
ConvBias::DirectParam{});
check(algo);
}

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_UINT4_WEIGHT_PREPROCESS) {
require_compute_capability(7, 5);
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle_cuda());
auto check = [&checker](const std::string& algo) {
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
UniformIntRNG rng{-8, 8};
UniformIntRNG bias_rng{-50, 50};
UniformIntRNG rng_u4{0, 15};
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &bias_rng)
.set_rng(3, &rng_u4)
.set_dtype(0, dtype::QuantizedS8{0.2f})
.set_dtype(1, dtype::QuantizedS8{0.3f})
.set_dtype(2, dtype::QuantizedS32{0.2f * 0.3f})
.set_dtype(3, dtype::Quantized4Asymm{0.5f, 8})
.set_dtype(4, dtype::Quantized4Asymm{0.5f, 4})
.set_epsilon(1 + 1e-3);
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
param.pad_h = param.pad_w = 0;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
};
std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
ConvBias::DirectParam{});
check(algo);
algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
ConvBias::DirectParam{});
check(algo);
}

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_FLOAT) {
require_compute_capability(7, 5);
Checker<ConvBiasForward> checker(handle_cuda());
auto check = [&checker](const std::string& algo) {
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
UniformIntRNG rng{-8, 8};
UniformFloatRNG float_rng{-50, 50};
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &float_rng)
.set_rng(3, &float_rng)
.set_dtype(0, dtype::QuantizedS8(1.9980618f))
.set_dtype(1, dtype::QuantizedS8(1.9980927f))
.set_dtype(2, dtype::Float32())
.set_dtype(3, dtype::Float32())
.set_dtype(4, dtype::Float32());
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
param.pad_h = param.pad_w = 0;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
};
std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
ConvBias::DirectParam{});
check(algo);
algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
ConvBias::DirectParam{});
check(algo);
}

#endif

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) {
@@ -969,7 +1088,7 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) {
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<
ConvBiasForward>(
ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X128X64_64X64X64_2",
"INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X128X32_32X64X32_1",
ConvBias::DirectParam{})
.c_str()));
checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
@@ -1109,6 +1228,16 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
"DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
param::ConvBias::Format::NCHW32);
}

TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NHWC) {
require_compute_capability(7, 5);
benchmark_target_algo_with_cudnn_tsc(
handle_cuda(), get_det_first_bench_args(16),
dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
"DIRECT:INT8_NHWC_IMMA_IMPLICIT_GEMM",
param::ConvBias::Format::NHWC);
}
#endif

TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {


+ 6
- 2
dnn/test/cuda/conv_test_utils.cpp View File

@@ -102,9 +102,7 @@ std::vector<BenchArgs> get_det_first_bench_args(size_t batch) {
args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2});
args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1});
args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 3, 2});
args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1});
args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 1, 2});
return args;
}

@@ -333,6 +331,9 @@ void benchmark_target_algo_with_cudnn_tsc(
.reshape({shape[0], shape[1] / 4, 4, shape[2],
shape[3]})
.dimshuffle({1, 3, 4, 0, 2}));
} else if (format == Format::NHWC) {
ret = static_cast<TensorShape>(
TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1}));
}
return ret;
};
@@ -363,6 +364,9 @@ void benchmark_target_algo_with_cudnn_tsc(
if ((format == Format::CHWN4 || format == Format::NCHW4) &&
(arg.ci % 16 != 0))
continue;
// skip testcase which cannot enable nhwc tensorcore
if ((format == Format::NHWC) && (arg.ci % 4 != 0 || arg.co % 4 != 0))
continue;
Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
? Format::NCHW32
: Format::NCHW4;
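For context: the new NHWC branch above converts a benchmark shape given as NCHW into NHWC via dimshuffle({0, 2, 3, 1}); for example {16, 16, 7, 7} becomes {16, 7, 7, 16}. Cases whose input or output channel count is not a multiple of 4 are skipped because, per the comment, the NHWC tensor-core kernels cannot be enabled for them.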


+ 0
- 1
dnn/test/cuda/convolution.cpp View File

@@ -327,7 +327,6 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
all_params.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2});

for (auto algo_param : all_params) {
Checker<ConvolutionBackwardData> checker(handle_cuda());

