
feat(dnn/cuda): add nhwc int8 imma conv and conv fuse typecvt

GitOrigin-RevId: 229e1eb4be
Branch: release-1.6
Author: Megvii Engine Team, 3 years ago
Commit: 11f022ff7c
33 changed files with 1497 additions and 441 deletions
  1. dnn/scripts/cutlass_generator/conv2d_operation.py (+19, -17)
  2. dnn/scripts/cutlass_generator/generator.py (+102, -29)
  3. dnn/scripts/cutlass_generator/library.py (+18, -0)
  4. dnn/scripts/cutlass_generator/list.bzl (+455, -170)
  5. dnn/src/common/convolution.cpp (+4, -1)
  6. dnn/src/cuda/conv_bias/algo.cpp (+30, -1)
  7. dnn/src/cuda/conv_bias/algo.h (+45, -1)
  8. dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp (+16, -5)
  9. dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu (+45, -26)
  10. dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh (+3, -3)
  11. dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp (+16, -2)
  12. dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp (+10, -2)
  13. dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp (+34, -27)
  14. dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp (+10, -2)
  15. dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp (+12, -5)
  16. dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp (+294, -0)
  17. dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp (+29, -10)
  18. dnn/src/cuda/conv_bias/opr_impl.h (+1, -0)
  19. dnn/src/cuda/convolution/backward_data/algo.h (+2, -0)
  20. dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp (+41, -30)
  21. dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp (+42, -27)
  22. dnn/src/cuda/cutlass/convolution_operation.h (+2, -2)
  23. dnn/src/cuda/cutlass/library.h (+1, -1)
  24. dnn/src/cuda/cutlass/operation_table.cpp (+15, -13)
  25. dnn/src/cuda/cutlass/operation_table.h (+6, -6)
  26. dnn/src/cuda/cutlass/util.cu (+29, -0)
  27. dnn/src/cuda/cutlass/util.h (+4, -0)
  28. dnn/src/cuda/matrix_mul/algos.h (+4, -0)
  29. dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp (+36, -28)
  30. dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp (+36, -29)
  31. dnn/test/cuda/conv_bias_int8.cpp (+130, -1)
  32. dnn/test/cuda/conv_test_utils.cpp (+6, -2)
  33. dnn/test/cuda/convolution.cpp (+0, -1)

dnn/scripts/cutlass_generator/conv2d_operation.py (+19, -17)

@@ -19,8 +19,8 @@ class Conv2dOperation:
#
def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, \
without_shared_load = False, required_cuda_ver_major = 9, required_cuda_ver_minor = 2):

self.operation_kind = OperationKind.Conv2d
self.conv_kind = conv_kind
@@ -34,7 +34,7 @@ class Conv2dOperation:
self.element_epilogue = element_epilogue
self.epilogue_functor = epilogue_functor
self.swizzling_functor = swizzling_functor
self.need_load_from_const = need_load_from_const
self.special_optimization = special_optimization
self.implicit_gemm_mode = implicit_gemm_mode
self.without_shared_load = without_shared_load
self.required_cuda_ver_major = required_cuda_ver_major
@@ -60,16 +60,18 @@ class Conv2dOperation:
else:
inst_shape = ''

unity_kernel = ''
if not self.need_load_from_const:
unity_kernel = '_1x1'
special_opt = ''
if self.special_optimization == SpecialOptimizeDesc.ConvFilterUnity:
special_opt = '_1x1'
elif self.special_optimization == SpecialOptimizeDesc.DeconvDoubleUpsampling:
special_opt = '_s2'

reorder_k = ''
if self.without_shared_load:
reorder_k = '_roc'

return "%s%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \
inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \
inst_shape, intermediate_type, ConvKindNames[self.conv_kind], special_opt, \
reorder_k, ShortEpilogueNames[self.epilogue_functor])
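
The suffix logic above is what produces the `_1x1`, `_s2`, and `_roc` pieces visible in the generated kernel names in list.bzl further down. A minimal standalone sketch of that naming rule, with the new SpecialOptimizeDesc enum redeclared locally purely for illustration:

```python
# Illustrative sketch (not the generator itself): how procedural kernel names
# pick up the special-optimization and reorder-K suffixes introduced here.
import enum

class SpecialOptimizeDesc(enum.Enum):  # mirrors the enum added in library.py
    NoneSpecialOpt = 0
    ConvFilterUnity = 1
    DeconvDoubleUpsampling = 2

def name_suffixes(special_optimization, without_shared_load):
    special_opt = ''
    if special_optimization == SpecialOptimizeDesc.ConvFilterUnity:
        special_opt = '_1x1'   # 1x1-filter fast path
    elif special_optimization == SpecialOptimizeDesc.DeconvDoubleUpsampling:
        special_opt = '_s2'    # stride-2 deconv fast path
    reorder_k = '_roc' if without_shared_load else ''
    return special_opt + reorder_k

assert name_suffixes(SpecialOptimizeDesc.ConvFilterUnity, True) == '_1x1_roc'
assert name_suffixes(SpecialOptimizeDesc.DeconvDoubleUpsampling, False) == '_s2'
```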

#
@@ -183,7 +185,7 @@ using Convolution =
${stages},
${alignment_src},
${alignment_filter},
${nonuninity_kernel},
${special_optimization},
${math_operator},
${implicit_gemm_mode},
${without_shared_load}>;
@@ -226,7 +228,7 @@ using Convolution =
'stages': str(operation.tile_description.stages),
'alignment_src': str(operation.src.alignment),
'alignment_filter': str(operation.flt.alignment),
'nonuninity_kernel': str(operation.need_load_from_const).lower(),
'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode],
'without_shared_load': str(operation.without_shared_load).lower()
@@ -266,7 +268,7 @@ using Deconvolution =
${stages},
${alignment_src},
${alignment_filter},
${nonuninity_kernel},
${special_optimization},
${math_operator},
${implicit_gemm_mode}>;
"""
@@ -308,7 +310,7 @@ using Deconvolution =
'stages': str(operation.tile_description.stages),
'alignment_src': str(operation.src.alignment),
'alignment_filter': str(operation.flt.alignment),
'nonuninity_kernel': str(operation.need_load_from_const).lower(),
'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode]
}
@@ -323,9 +325,9 @@ using Deconvolution =
###################################################################################################

#
def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \
required_cuda_ver_minor = 2):
def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 32, \
use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
operations = []

element_epilogue = DataType.f32
@@ -412,10 +414,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay
bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))

new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
operations.append(new_operation)
if not skip_unity_kernel:
new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt:
new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, use_special_optimization , implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
operations.append(new_operation)
return operations
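
The tail of GenerateConv2d above replaces the old skip_unity_kernel flag with use_special_optimization: a plain kernel is always emitted, and a second specialized variant only when a special optimization is requested. A small self-contained sketch of that rule, using string stand-ins instead of the real SpecialOptimizeDesc values:

```python
# Sketch of the new emission rule (names here are illustrative stand-ins).
def emit_variants(use_special_optimization, none_opt='none'):
    variants = [none_opt]                      # plain kernel, no special path
    if use_special_optimization != none_opt:   # e.g. 'conv_filter_unity'
        variants.append(use_special_optimization)
    return variants

assert emit_variants('none') == ['none']
assert emit_variants('conv_filter_unity') == ['none', 'conv_filter_unity']
```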



dnn/scripts/cutlass_generator/generator.py (+102, -29)

@@ -168,10 +168,10 @@ def GenerateConv2d_Simt(args):
for dst_type, dst_layout in zip(dst_types, dst_layouts):
if dst_type == DataType.s4 or dst_type == DataType.u4:
min_cc = 75
skip_unity_kernel = True
use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt
else:
min_cc = 61
skip_unity_kernel = False
use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
tile_descriptions = [
TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
@@ -182,10 +182,16 @@ def GenerateConv2d_Simt(args):
TileDescription([ 64, 32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 32, 32, 32,
skip_unity_kernel)
]
for tile in tile_descriptions:
if dst_layout == LayoutType.TensorNC32HW32 and tile.threadblock_shape[0] > 32:
continue
if (dst_layout == LayoutType.TensorNCHW or dst_layout == LayoutType.TensorNHWC) \
and tile.threadblock_shape[0] > 16:
continue
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
dst_layout, dst_type, min_cc, 32, 32, 32,
use_special_optimization)
return operations
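
The per-tile loop added above filters SIMT tile shapes by destination layout before calling GenerateConv2d. A compact sketch of that filter, with layouts written as plain strings for illustration:

```python
# Sketch of the SIMT tile filter: some destination layouts only accept small
# threadblock-M tiles (thresholds mirror the loop above).
def tile_allowed(dst_layout, threadblock_m):
    if dst_layout == 'NC32HW32' and threadblock_m > 32:
        return False
    if dst_layout in ('NCHW', 'NHWC') and threadblock_m > 16:
        return False
    return True

assert tile_allowed('NC32HW32', 32)
assert not tile_allowed('NHWC', 64)
assert tile_allowed('NC4HW4', 128)
```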


@@ -214,6 +220,8 @@ def GenerateConv2d_TensorOp_8816(args):
DataType.s8,
]

use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity

min_cc = 75
max_cc = 1024

@@ -232,28 +240,69 @@ def GenerateConv2d_TensorOp_8816(args):
TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 128, 128, 64,
False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
else:
assert dst_layout == LayoutType.TensorNC4HW4
tile_descriptions = [
TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 128, 128, 64,
False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization,
ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)

layouts_nhwc = [
(LayoutType.TensorNHWC, LayoutType.TensorNC4HW4, 32),
(LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 64),
(LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 128),
]

dst_layouts_nhwc = [
LayoutType.TensorNHWC,
]

for math_inst in math_instructions:
for layout in layouts_nhwc:
for dst_layout in dst_layouts_nhwc:
dst_type = math_inst.element_b
tile_descriptions = [
TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
]
for tile in tile_descriptions:
dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
if tile.threadblock_shape[1] == 16 or tile.threadblock_shape[1] == 32:
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)

out_dtypes = [DataType.s4, DataType.u4, DataType.f32]

#INT8x8x4 and INT8x8x32
for math_inst in math_instructions:
for layout in layouts_nhwc:
for dst_layout in dst_layouts_nhwc:
for out_dtype in out_dtypes:
tile_descriptions = [
TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
]
for tile in tile_descriptions:
dst_align = 4 * DataTypeSize[out_dtype] if tile.threadblock_shape[1] == 16 or out_dtype == DataType.f32 \
else 8 * DataTypeSize[out_dtype]
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
if tile.threadblock_shape[1] == 16 or (tile.threadblock_shape[1] == 32 and out_dtype != DataType.f32):
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
return operations
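
The INT8x8x4/INT8x8x32 loop above is where the fused type conversion from the commit title shows up: int8 NHWC convolutions may now write s4, u4, or f32 outputs, and dst_align is chosen from the store width. A sketch of that alignment rule, assuming alignments are expressed in bits as elsewhere in the generator:

```python
# Sketch of the destination-alignment rule for NHWC int8 kernels with fused
# output type conversion (values in bits).
DataTypeSize = {'s4': 4, 'u4': 4, 's8': 8, 'f32': 32}   # subset, for illustration

def dst_align_bits(out_dtype, threadblock_n):
    # Narrow tiles (N == 16) and f32 outputs store 4 elements per access;
    # wider tiles store 8 elements per access.
    elems = 4 if threadblock_n == 16 or out_dtype == 'f32' else 8
    return elems * DataTypeSize[out_dtype]

assert dst_align_bits('s4', 32) == 32    # 8 x 4-bit
assert dst_align_bits('u4', 16) == 16    # 4 x 4-bit
assert dst_align_bits('f32', 32) == 128  # 4 x 32-bit
```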

@@ -281,6 +330,8 @@ def GenerateConv2d_TensorOp_8832(args):
LayoutType.TensorNC64HW64,
]

use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity

min_cc = 75
max_cc = 1024

@@ -298,8 +349,8 @@ def GenerateConv2d_TensorOp_8832(args):
TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
]
operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 128, 128, 64,
False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)

layouts_nhwc = [
(LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),
@@ -316,18 +367,39 @@ def GenerateConv2d_TensorOp_8832(args):
for dst_layout in dst_layouts_nhwc:
dst_type = math_inst.element_b
tile_descriptions = [
TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
]
for tile in tile_descriptions:
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
dst_layout, dst_type, min_cc, layout[2], layout[2], 32,
False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
dst_align = 16 if tile.threadblock_shape[1] == 16 else 32
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
dst_align = 32 if tile.threadblock_shape[1] == 32 else 64
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align,
False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
# INT4x4x8
for math_inst in math_instructions:
for layout in layouts_nhwc:
for dst_layout in dst_layouts_nhwc:
tile_descriptions = [
TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
]
for tile in tile_descriptions:
dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
dst_align = 64 if tile.threadblock_shape[1] == 32 else 128
operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)

return operations

@@ -354,6 +426,8 @@ def GenerateDeconv_Simt(args):
DataType.s8,
]

use_special_optimization = SpecialOptimizeDesc.DeconvDoubleUpsampling

min_cc = 61
max_cc = 1024

@@ -361,7 +435,6 @@ def GenerateDeconv_Simt(args):
for layout in layouts:
for dst_type, dst_layout in zip(dst_types, dst_layouts):
tile_descriptions = [
TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
@@ -369,7 +442,7 @@ def GenerateDeconv_Simt(args):
]
operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1],
dst_layout, dst_type, min_cc, 32, 32, 32,
True)
use_special_optimization)
return operations

################################################################################


dnn/scripts/cutlass_generator/library.py (+18, -0)

@@ -562,6 +562,24 @@ StrideSupportNames = {
StrideSupport.Unity: 'unity_stride',
}

class SpecialOptimizeDesc(enum.Enum):
NoneSpecialOpt = enum_auto()
ConvFilterUnity = enum_auto()
DeconvDoubleUpsampling = enum_auto()

SpecialOptimizeDescNames = {
SpecialOptimizeDesc.NoneSpecialOpt: 'none',
SpecialOptimizeDesc.ConvFilterUnity: 'conv_filter_unity',
SpecialOptimizeDesc.DeconvDoubleUpsampling: 'deconv_double_upsampling',
}

SpecialOptimizeDescTag = {
SpecialOptimizeDesc.NoneSpecialOpt: 'cutlass::conv::SpecialOptimizeDesc::NONE',
SpecialOptimizeDesc.ConvFilterUnity: 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY',
SpecialOptimizeDesc.DeconvDoubleUpsampling: 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING',
}


class ImplicitGemmMode(enum.Enum):
GemmNT = enum_auto()
GemmTN = enum_auto()
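
The new SpecialOptimizeDescTag table above is consumed by the ${special_optimization} slot of the kernel templates in conv2d_operation.py. An illustrative sketch of that substitution using Python's string.Template (the generator's own template helper may differ), with the table keyed by name for brevity:

```python
# Illustrative sketch: substituting the CUTLASS C++ tag into the template slot.
from string import Template

special_optimize_tag = {  # mirrors SpecialOptimizeDescTag, keyed by name here
    'NoneSpecialOpt': 'cutlass::conv::SpecialOptimizeDesc::NONE',
    'ConvFilterUnity': 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY',
    'DeconvDoubleUpsampling': 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING',
}

template = Template("    ${special_optimization},\n    ${math_operator},")
print(template.substitute(
    special_optimization=special_optimize_tag['ConvFilterUnity'],
    math_operator='cutlass::arch::OpMultiplyAddSaturate'))
```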


dnn/scripts/cutlass_generator/list.bzl (+455, -170)

@@ -455,11 +455,14 @@ cutlass_gen_list = [
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu",
"cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu",
"cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu",
"cutlass_simt_s8_idgrad_id_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_s2_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_s2_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu",
"all_deconv_simt_operations.cu",
"cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
@@ -515,30 +518,6 @@ cutlass_gen_list = [
"cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
@@ -551,108 +530,18 @@ cutlass_gen_list = [
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_u4_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_f32_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
@@ -708,72 +597,288 @@ cutlass_gen_list = [
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
"all_conv2d_tensorop8816_operations.cu",
"cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
@@ -815,6 +920,12 @@ cutlass_gen_list = [
"cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
@@ -839,6 +950,12 @@ cutlass_gen_list = [
"cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
@@ -863,6 +980,12 @@ cutlass_gen_list = [
"cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
@@ -887,6 +1010,10 @@ cutlass_gen_list = [
"cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
@@ -903,6 +1030,10 @@ cutlass_gen_list = [
"cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
@@ -919,6 +1050,10 @@ cutlass_gen_list = [
"cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
@@ -935,5 +1070,155 @@ cutlass_gen_list = [
"cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
"all_conv2d_tensorop8832_operations.cu",
]
]
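
A note on the generated file names listed above: each entry appears to encode its kernel configuration directly in the name. The short C++ sketch below re-assembles one such name; the field order (dst dtype, mma shape, optional 1x1 and roc markers, activation, src dtype, threadblock shape, warp shape, stage count, src layout, filter layout) is inferred from the list itself, and the meaning of "roc" is a guess, so treat the whole decomposition as an assumption rather than the generator's actual logic.

// Hypothetical re-assembly of a generated kernel file name; purely
// illustrative, not the Python generator's real implementation.
#include <cstdio>
#include <string>

std::string kernel_file_name(const std::string& dst, const std::string& mma,
                             bool conv1x1, bool roc, const std::string& act,
                             const std::string& src, const std::string& tb,
                             const std::string& warp, int stages,
                             const std::string& src_layout,
                             const std::string& flt_layout) {
    std::string name = "cutlass_tensorop_" + dst + "_" + mma + "fprop_";
    if (conv1x1)
        name += "1x1_";  // filter-unity (1x1) specialization
    if (roc)
        name += "roc_";  // presumably the reordered-OC / without_shared_load variant
    name += act + "_" + src + "_" + tb + "_" + warp + "_" +
            std::to_string(stages) + "_" + src_layout + "_" + flt_layout + ".cu";
    return name;
}

int main() {
    std::printf("%s\n",
                kernel_file_name("s8", "i8832", false, true, "relu", "s4",
                                 "128x64x64", "64x64x64", 1, "nhwc",
                                 "nc32hw32").c_str());
    // -> cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu
}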

+ 4
- 1
dnn/src/common/convolution.cpp View File

@@ -553,7 +553,10 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src,
dst.valid() && (dst.enumv() == src.enumv() ||
((dst.enumv() == DTypeEnum::QuantizedS4 ||
dst.enumv() == DTypeEnum::Quantized4Asymm) &&
src.enumv() == DTypeEnum::QuantizedS8));
src.enumv() == DTypeEnum::QuantizedS8) ||
((src.enumv() == DTypeEnum::QuantizedS4 ||
src.enumv() == DTypeEnum::Quantized4Asymm) &&
dst.enumv() == DTypeEnum::QuantizedS8));
if (cond_dst) {
supported_dst_dtype.push_back(dst);
}
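
The hunk above widens the dtype deduction so that, besides keeping the source dtype or narrowing QuantizedS8 to a 4-bit output, a 4-bit source (QuantizedS4 or Quantized4Asymm) may now produce a QuantizedS8 destination, which is what enables the fused conv + typecvt path. A minimal standalone sketch of the accepted (src, dst) pairs, using a stand-in DTypeEnum rather than megdnn's own header:

#include <cstdio>

enum class DTypeEnum { QuantizedS8, QuantizedS4, Quantized4Asymm };

// dst may keep src's dtype, or convert 8-bit <-> 4-bit in either direction.
bool fused_typecvt_dst_ok(DTypeEnum src, DTypeEnum dst) {
    bool s8_to_4bit = src == DTypeEnum::QuantizedS8 &&
                      (dst == DTypeEnum::QuantizedS4 ||
                       dst == DTypeEnum::Quantized4Asymm);
    bool q4_to_s8 = (src == DTypeEnum::QuantizedS4 ||
                     src == DTypeEnum::Quantized4Asymm) &&
                    dst == DTypeEnum::QuantizedS8;
    return dst == src || s8_to_4bit || q4_to_s8;
}

int main() {
    std::printf("%d\n", fused_typecvt_dst_ok(DTypeEnum::QuantizedS4,
                                             DTypeEnum::QuantizedS8));  // 1
}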


+ 30
- 1
dnn/src/cuda/conv_bias/algo.cpp View File

@@ -71,6 +71,9 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
for (auto&& algo : int8_nchw32_imma) {
all_algos.push_back(&algo);
}
for (auto&& algo : int8_nhwc_imma) {
all_algos.push_back(&algo);
}
for (auto&& algo : int4_int4_nchw64_imma) {
all_algos.push_back(&algo);
}
@@ -236,7 +239,21 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
int8_nchw32_imma.emplace_back(
AlgoParam{32, 128, 32, 32, 64, 32, 8, 8, 16, 1});
}

{
using AlgoParam = AlgoInt8NHWCIMMAImplicitGemm::AlgoParam;
int8_nhwc_imma.emplace_back(
AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 16});
int8_nhwc_imma.emplace_back(
AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 8});
int8_nhwc_imma.emplace_back(
AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 4});
int8_nhwc_imma.emplace_back(
AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 16});
int8_nhwc_imma.emplace_back(
AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 8});
int8_nhwc_imma.emplace_back(
AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 4});
}
{
using AlgoParam = AlgoInt4Int4NCHW64IMMAImplicitGemm::AlgoParam;
int4_int4_nchw64_imma.emplace_back(
@@ -262,6 +279,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
{
using AlgoParam = AlgoInt4Int4NHWCIMMAImplicitGemm::AlgoParam;
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32});
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16});
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8});
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32});
int4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 16});
@@ -277,6 +300,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
{
using AlgoParam = AlgoUInt4Int4NHWCIMMAImplicitGemm::AlgoParam;
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32});
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16});
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8});
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32});
uint4_int4_nhwc_imma.emplace_back(
AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 16});
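
The brace-initialized AlgoParam entries above are positional. Going by to_string() and the ConvolutionKey construction elsewhere in this commit, the order appears to be threadblock shape, warp shape, mma instruction shape, stage count, and access size; the named struct below is only a reading aid under that assumption, not the real AlgoParam definition.

#include <cstdio>

struct AlgoParamSketch {
    int threadblock_m, threadblock_n, threadblock_k;
    int warp_m, warp_n, warp_k;
    int instruction_m, instruction_n, instruction_k;
    int stage;
    int access_size;  // elements per global-memory access (NHWC algos)
};

int main() {
    // One of the new int8 NHWC entries: 64x16x32 threadblock, 64x16x32 warp,
    // 8x8x16 mma instruction, 2 stages, 16-element access.
    AlgoParamSketch p{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 16};
    // Reconstructs the algorithm name the same way to_string() formats it.
    std::printf("INT8_NHWC_IMMA_IMPLICIT_GEMM_%dX%dX%d_%dX%dX%d_%d_%d\n",
                p.threadblock_m, p.threadblock_n, p.threadblock_k, p.warp_m,
                p.warp_n, p.warp_k, p.stage, p.access_size);
}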


+ 45
- 1
dnn/src/cuda/conv_bias/algo.h View File

@@ -72,6 +72,7 @@ public:
CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8,
CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8,
CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8,
CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8,
CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4,
CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4,
CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4,
@@ -524,6 +525,7 @@ public:
* +
* +--- AlgoInt8NCHW4DotProdImplicitGemm
* +--- AlgoInt8NCHW32IMMAImplicitGemm
* +--- AlgoInt8NHWCIMMAImplicitGemm
* +
* +--- AlgoInt4NCHW64IMMAImplicitGemmBase
* +----+--- AlgoInt4Int4NCHW64IMMAImplicitGemm
@@ -582,7 +584,7 @@ public:
// operation (cutlass kernel) from the global OperationTable
const cutlass::library::Operation* get_cutlass_conv_op(
const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
bool load_from_const, bool without_shared_load) const;
bool use_conv_filter_unity_opt, bool without_shared_load) const;

// execute the cutlass kernel found by get_cutlass_conv_op. we give
// subclasses full freedom to decide where and how these arguments are
@@ -829,6 +831,47 @@ private:
std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm final
: public AlgoCutlassConvolutionBase {
public:
AlgoInt8NHWCIMMAImplicitGemm(AlgoParam algo_param)
: AlgoCutlassConvolutionBase(algo_param) {
m_name = ConvBias::algo_name<ConvBias::DirectParam>(
ssprintf("INT8_NHWC_IMMA_IMPLICIT_GEMM_%s",
to_string(m_algo_param).c_str()),
ConvBias::DirectParam{});
}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE;
}
static std::string to_string(AlgoParam algo_param);
size_t get_preprocess_workspace_in_bytes(
const SizeArgs& args) const override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const SizeArgs& args) const override;
void exec_preprocess(const ExecArgs& args) const override;
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8)

std::string param() const override {
std::string ret;
serialize_write_pod(m_algo_param, ret);
return ret;
}

private:
std::tuple<float, float, float, float, float> get_constants(
const ExecArgs& args) const;

void reorder_filter(const ExecArgs& args, int interleaved,
void* reordered_filter) const;

std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase
: public AlgoCutlassConvolutionBase {
public:
@@ -1087,6 +1130,7 @@ public:
#endif
#if CUDA_VERSION >= 10020
std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma;
std::vector<AlgoInt4Int4NCHW64IMMAImplicitGemm> int4_int4_nchw64_imma;
std::vector<AlgoUInt4Int4NCHW64IMMAImplicitGemm> uint4_int4_nchw64_imma;
std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;


+ 16
- 5
dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp View File

@@ -140,6 +140,11 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format,
LayoutTypeID::kTensorNC64HW64};
case Format::NHWC:
switch (access_type) {
case 4:
return {LayoutTypeID::kTensorNHWC,
LayoutTypeID::kTensorNC4HW4,
LayoutTypeID::kTensorNHWC,
LayoutTypeID::kTensorNHWC};
case 8:
return {LayoutTypeID::kTensorNHWC,
LayoutTypeID::kTensorNC8HW8,
@@ -192,12 +197,18 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode,
const Operation*
ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
bool load_from_const, bool without_shared_load) const {
using Format = param::ConvBias::Format;
bool use_conv_filter_unity_opt, bool without_shared_load) const {
auto&& param = args.opr->param();
auto layouts = get_layout_pack(param.format, m_algo_param.access_size);
auto epilogue_type = get_epilogue_type(param.nonlineMode,
param.format != Format::NCHW4_NCHW);
auto epilogue_type = get_epilogue_type(
param.nonlineMode,
args.dst_layout->dtype.enumv() != DTypeEnum::Float32);

cutlass::conv::SpecialOptimizeDesc special_optimization =
(use_conv_filter_unity_opt)
? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY
: cutlass::conv::SpecialOptimizeDesc::NONE;

ConvolutionKey key{convert_conv_op(conv_op),
convert_dtype(args.src_layout->dtype.enumv()),
layouts.src,
@@ -219,7 +230,7 @@ ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
m_algo_param.instruction_k,
epilogue_type,
m_algo_param.stage,
load_from_const,
special_optimization,
without_shared_load};

return Singleton::get().operation_table.find_op(key);
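
Two things change in get_cutlass_conv_op above: the old load_from_const boolean becomes an explicit cutlass::conv::SpecialOptimizeDesc, and the epilogue variant is now chosen by the destination dtype instead of the tensor format. The sketch below captures that selection logic under the assumption that the second argument of get_epilogue_type picks the clamping (saturating) epilogue; the enum values are stand-ins modeled on names visible in this diff.

#include <cstdio>

enum class SpecialOptimizeDesc { NONE, CONV_FILTER_UNITY, DECONV_DOUBLE_UPSAMPLING };
enum class EpilogueKind { BiasAddLinearCombination, BiasAddLinearCombinationClamp };

SpecialOptimizeDesc select_special_opt(bool use_conv_filter_unity_opt) {
    // a 1x1 filter lets the kernel skip the filter-pixel loop
    return use_conv_filter_unity_opt ? SpecialOptimizeDesc::CONV_FILTER_UNITY
                                     : SpecialOptimizeDesc::NONE;
}

EpilogueKind select_epilogue(bool dst_is_float32) {
    // quantized outputs saturate to their representable range; a float32
    // output (conv fused with a dequantizing typecvt) must not be clamped
    return dst_is_float32 ? EpilogueKind::BiasAddLinearCombination
                          : EpilogueKind::BiasAddLinearCombinationClamp;
}

int main() {
    std::printf("%d %d\n", static_cast<int>(select_special_opt(true)),
                static_cast<int>(select_epilogue(false)));  // 1 1
}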


+ 45
- 26
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu View File

@@ -144,28 +144,48 @@ void megdnn::cuda::cutlass_wrapper::reorder_ncxhwx_imma_filter(
IC, FH, FW, trans_oc);
after_kernel_launch();
}

template <uint32_t size_bits, uint32_t alignbits>
template <uint32_t size_bits>
void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter(
int8_t* dst_filter, const int8_t* src_filter, uint32_t OC, uint32_t IC,
uint32_t FH, uint32_t FW, bool trans_oc, uint32_t oc_interleaved,
cudaStream_t stream) {
static constexpr uint32_t elements_per_access = alignbits / size_bits;
uint32_t nr_threads =
query_blocksize_for_kernel(reinterpret_cast<const void*>(
reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32>));
uint32_t FH, uint32_t FW, bool trans_oc, uint32_t alignbits,
uint32_t interleaved, cudaStream_t stream) {
const uint32_t elements_per_access = alignbits / size_bits;

void (*kern)(int8_t* __restrict__, const int8_t* __restrict__, uint32_t,
uint32_t, uint32_t, uint32_t, bool);
kern = nullptr;

auto get_kern = [&kern](const uint32_t alignbits,
const uint32_t interleaved) {
#define DISPATCH_KERNEL(alignbits_, interleaved_) \
if (alignbits == alignbits_ && interleaved == interleaved_) { \
kern = reorder_nhwc_imma_filter_kernel<size_bits, alignbits_, \
interleaved_>; \
return; \
}
DISPATCH_KERNEL(128, 16);
DISPATCH_KERNEL(64, 16);
DISPATCH_KERNEL(32, 16);
DISPATCH_KERNEL(128, 32);
DISPATCH_KERNEL(64, 32);
DISPATCH_KERNEL(32, 32);
DISPATCH_KERNEL(128, 64);
DISPATCH_KERNEL(64, 64);
DISPATCH_KERNEL(32, 64);

#undef DISPATCH_KERNEL
};

get_kern(alignbits, interleaved);

uint32_t nr_threads = query_blocksize_for_kernel(kern);
uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_access);
nr_threads = std::min(nr_threads, vthreads);
uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
if (oc_interleaved == 32) {
reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32>
<<<nr_blocks, nr_threads, 0, stream>>>(
dst_filter, src_filter, OC, IC, FH, FW, trans_oc);
} else {
reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 64>
<<<nr_blocks, nr_threads, 0, stream>>>(
dst_filter, src_filter, OC, IC, FH, FW, trans_oc);
}

kern<<<nr_blocks, nr_threads, 0, stream>>>(dst_filter, src_filter, OC, IC,
FH, FW, trans_oc);

after_kernel_launch();
}

@@ -180,15 +200,14 @@ INST(8, 32)
INST(4, 64)
#undef INST

#define INST(_size_bits, _alignbits) \
template void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter< \
_size_bits, _alignbits>( \
int8_t * dst_filter, const int8_t* src_filter, uint32_t OC, \
uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc, \
uint32_t oc_interleaved, cudaStream_t stream);
INST(4, 32)
INST(4, 64)
INST(4, 128)
#define INST(_size_bits) \
template void \
megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter<_size_bits>( \
int8_t * dst_filter, const int8_t* src_filter, uint32_t OC, \
uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc, \
uint32_t alignbits, uint32_t interleaved, cudaStream_t stream);
INST(4)
INST(8)
#undef INST

// vim: syntax=cuda.doxygen
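
The rewritten reorder_nhwc_imma_filter above moves alignbits and interleaved from template parameters to runtime arguments and uses a small macro table to pick the matching kernel instantiation. Below is a host-only sketch of that dispatch pattern, with stub function names standing in for the real CUDA kernels:

#include <cstdint>
#include <cstdio>

template <uint32_t size_bits, uint32_t alignbits, uint32_t interleaved>
void reorder_kernel_stub() {
    std::printf("size_bits=%u alignbits=%u interleaved=%u\n", size_bits,
                alignbits, interleaved);
}

template <uint32_t size_bits>
void dispatch_reorder(uint32_t alignbits, uint32_t interleaved) {
    void (*kern)() = nullptr;
#define DISPATCH_KERNEL(alignbits_, interleaved_)                        \
    if (alignbits == alignbits_ && interleaved == interleaved_) {        \
        kern = reorder_kernel_stub<size_bits, alignbits_, interleaved_>; \
    }
    DISPATCH_KERNEL(128, 16);
    DISPATCH_KERNEL(64, 16);
    DISPATCH_KERNEL(32, 16);
    DISPATCH_KERNEL(128, 32);
    DISPATCH_KERNEL(64, 32);
    DISPATCH_KERNEL(32, 32);
#undef DISPATCH_KERNEL
    if (kern)
        kern();  // the real code launches the selected kernel on a CUDA stream
}

int main() {
    dispatch_reorder<8>(128, 16);  // int8 filter, 128-bit access, interleave 16
}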

+ 3
- 3
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh View File

@@ -23,11 +23,11 @@ void reorder_ncxhwx_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
uint32_t FW, bool trans_oc,
cudaStream_t stream);

template <uint32_t size_bits, uint32_t alignbits>
template <uint32_t size_bits>
void reorder_nhwc_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
uint32_t OC, uint32_t IC, uint32_t FH,
uint32_t FW, bool trans_oc,
uint32_t oc_interleaved, cudaStream_t stream);
uint32_t FW, bool trans_oc, uint32_t alignbits,
uint32_t interleaved, cudaStream_t stream);
} // namespace cutlass_wrapper
} // namespace cuda
} // namespace megdnn

+ 16
- 2
dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp View File

@@ -68,13 +68,27 @@ ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::get_constants(
args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
bias_scale =
args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
dst_scale;

if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
} else { // DTypeEnum::QuantizedS8
megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8);
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
}

float alpha = src_scale * filter_scale / dst_scale,
beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f;

if (args.z_layout->ndim > 0) {
float z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
float z_scale;
if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
} else { // DTypeEnum::QuantizedS8
megdnn_assert(args.z_layout->dtype.enumv() ==
DTypeEnum::QuantizedS8);
z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
}
gamma = z_scale / dst_scale;
}



+ 10
- 2
dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp View File

@@ -76,6 +76,14 @@ bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available(
if (fh * fw > kMaxFilterPixels)
return false;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = true;
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
if (op == nullptr)
return false;

return true;
}

@@ -110,7 +118,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(
float dst_scale = 0.f;
float threshold = 0.f;
uint8_t src_zero = 0;
bool load_from_const = !(fh == 1 && fw == 1);
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = true;

if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
@@ -126,7 +134,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(

const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
ConvType::kConvolution,
load_from_const, without_shared_load);
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,


+ 34
- 27
dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp View File

@@ -56,8 +56,11 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available(

if (args.src_layout->dtype.enumv() != src_dtype() ||
args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 ||
args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 ||
args.dst_layout->dtype.enumv() != src_dtype())
args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32)
return false;

if (!(args.dst_layout->dtype.enumv() == src_dtype() ||
args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8))
return false;

// uint4 does not support H_SWISH activation
@@ -83,6 +86,16 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available(
if ((co % 8 != 0) || (ci % m_algo_param.access_size != 0))
return false;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 32 ||
m_algo_param.threadblock_n == 64));
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
if (op == nullptr)
return false;

return true;
}

@@ -117,26 +130,31 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec(
float dst_scale = 0.f;
float threshold = 0.f;
uint8_t src_zero = 0;
bool load_from_const = !(fh == 1 && fw == 1);
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);

bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 32 ||
m_algo_param.threadblock_n == 64));

if (args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
}

if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
} else { // DTypeEnum::QuantizedS4
} else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
} else { // DTypeEnum::QuantizedS8
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
}

cudaStream_t stream = cuda_stream(args.opr->handle());

const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
ConvType::kConvolution,
load_from_const, without_shared_load);
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,
@@ -166,29 +184,18 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter(
cudaStream_t stream = cuda_stream(args.opr->handle());

// reformat filter from nhwc to ncxhwx and reorder oc
// use trans_oc threadblock_n must be 32 or 64
// to use trans_oc, threadblock_n must be 32 or 64 and src dtype must equal dst dtype
bool trans_oc = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 32 ||
m_algo_param.threadblock_n == 64));
uint32_t oc_iterleave = (m_algo_param.threadblock_n == 64) ? 64 : 32;

if (iterleaved == 8) {
cutlass_wrapper::reorder_nhwc_imma_filter<4, 32>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
fh, fw, trans_oc, oc_iterleave, stream);
} else if (iterleaved == 16) {
cutlass_wrapper::reorder_nhwc_imma_filter<4, 64>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
fh, fw, trans_oc, oc_iterleave, stream);
} else {
megdnn_assert(iterleaved == 32);
cutlass_wrapper::reorder_nhwc_imma_filter<4, 128>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
fh, fw, trans_oc, oc_iterleave, stream);
}
uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 64) ? 64 : 32;

uint32_t alignbits = iterleaved * 4;

cutlass_wrapper::reorder_nhwc_imma_filter<4>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh,
fw, trans_oc, alignbits, oc_iterleaved, stream);
}
#endif
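
The per-case calls removed above are replaced by a single computation of the access alignment in bits: alignbits is just the access size in elements times the element width, 4 bits here and 8 bits in the int8 NHWC algorithm. A one-line sketch covering both cases:

#include <cstdint>
#include <cstdio>

// align (bits) = elements per access * element width (bits)
uint32_t alignbits_for(uint32_t access_size, uint32_t size_bits) {
    return access_size * size_bits;
}

int main() {
    std::printf("%u %u\n",
                alignbits_for(32, 4),   // int4, 32-element access -> 128 bits
                alignbits_for(16, 8));  // int8, 16-element access -> 128 bits
}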



+ 10
- 2
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp View File

@@ -77,6 +77,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available(
// FIXME: too large filter size is not supported now
size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 32) - 2;
available &= fh * fw <= kMaxFilterPixels;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = (param.format == Format::NCHW32);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
available &= (op != nullptr);

return available;
}

@@ -155,12 +163,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
gamma = z_scale / dst_scale;
}
float delta = 0.f, theta = 0.f, threshold = 0.f;
bool load_from_const = !(fh == 1 && fw == 1);
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = (param.format == Format::NCHW32);

const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
ConvType::kConvolution,
load_from_const, without_shared_load);
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr,


+ 12
- 5
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp View File

@@ -98,7 +98,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
// FIXME: too large filter size is not supported now
size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 4) - 2;
available &= fh * fw <= kMaxFilterPixels;
;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = false;
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
available &= (op != nullptr);

return available;
}

@@ -213,12 +220,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
}
}
float threshold = 0.f;
bool load_from_const = !(fh == 1 && fw == 1);
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = false;

const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
ConvType::kConvolution,
load_from_const, without_shared_load);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr,


+ 294
- 0
dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp View File

@@ -0,0 +1,294 @@
/**
* \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/conv_bias/cutlass_reorder_filter.cuh"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace convolution;

#if CUDA_VERSION >= 10020
bool ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::is_available(
const SizeArgs& args) const {
if (args.bias_layout->ndim <= 0)
return false;

using Param = param::ConvBias;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
using NonlineMode = megdnn::param::ConvBias::NonlineMode;

auto&& param = args.opr->param();

if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
return false;

if (param.format != Format::NHWC || param.sparse != Sparse::DENSE ||
param.mode != Mode::CROSS_CORRELATION)
return false;

if (param.nonlineMode != NonlineMode::IDENTITY &&
param.nonlineMode != NonlineMode::RELU &&
param.nonlineMode != NonlineMode::H_SWISH)
return false;

if (args.src_layout->dtype.enumv() != DTypeEnum::QuantizedS8 ||
args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS8)
return false;

auto dst_dtype = args.dst_layout->dtype.enumv();

if (!(dst_dtype == DTypeEnum::QuantizedS8 ||
dst_dtype == DTypeEnum::QuantizedS4 ||
dst_dtype == DTypeEnum::Quantized4Asymm ||
dst_dtype == DTypeEnum::Float32))
return false;

if (!(args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
dst_dtype == DTypeEnum::Float32)))
return false;

if (!is_compute_capability_required(7, 5))
return false;

size_t co = args.filter_layout->operator[](0),
ci = args.filter_layout->operator[](3),
fh = args.filter_layout->operator[](1),
fw = args.filter_layout->operator[](2);

// param buffer size is 4K, use 3.4K to store precomputed offset
size_t kMaxFilterPixels =
848 / (m_algo_param.warp_k / m_algo_param.access_size) - 1;
if (fh * fw > kMaxFilterPixels)
return false;
// co must be a multiple of 4, and ci must be a multiple of
// m_algo_param.access_size
if ((co % 4 != 0) || (ci % m_algo_param.access_size != 0))
return false;

bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 16 ||
(m_algo_param.threadblock_n == 32 &&
dst_dtype != DTypeEnum::Float32)));
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);
if (op == nullptr)
return false;

return true;
}

size_t
ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_workspace_in_bytes(
const SizeArgs& args) const {
if (args.preprocessed_filter) {
return 0;
} else {
return args.filter_layout->span().dist_byte();
}
}

size_t ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::
get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
return 0;
}

SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::
deduce_preprocessed_filter_layout(const SizeArgs& args) const {
return {args.filter_layout->collapse_contiguous()};
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec_preprocess(
const ExecArgs& args) const {
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
reorder_filter(args, m_algo_param.access_size, filter_ptr);
}

std::tuple<float, float, float, float, float>
ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_constants(
const ExecArgs& args) const {
float src_scale = args.src_layout->dtype.param<dtype::QuantizedS8>().scale,
filter_scale =
args.filter_layout->dtype.param<dtype::QuantizedS8>().scale,
bias_scale = 1.f, dst_scale;

if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) {
bias_scale = args.bias_layout->dtype.param<dtype::QuantizedS32>().scale;
}

uint8_t dst_zero = 0;

if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
} else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
} else if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
} else { // DTypeEnum::Float32
megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
dst_scale = 1.f;
}

float alpha = src_scale * filter_scale / dst_scale,
beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
theta = dst_zero;

if (args.z_layout->ndim > 0) {
float z_scale;
if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS8) {
z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
gamma = z_scale / dst_scale;
} else if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
gamma = z_scale / dst_scale;
} else if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
z_scale =
args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
uint8_t z_zero =
args.z_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
gamma = z_scale / dst_scale;
delta = -z_zero * gamma;
} else { // DTypeEnum::Float32
megdnn_assert(args.z_layout->dtype.enumv() == DTypeEnum::Float32);
gamma = 1.f;
}
}

if (args.opr->param().nonlineMode ==
param::ConvBias::NonlineMode::IDENTITY) {
delta += theta;
theta = 0.f;
}

return {alpha, beta, gamma, delta, theta};
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
size_t n = args.src_layout->operator[](0),
ci = args.src_layout->operator[](3),
hi = args.src_layout->operator[](1),
wi = args.src_layout->operator[](2);
size_t co = args.dst_layout->operator[](3),
ho = args.dst_layout->operator[](1),
wo = args.dst_layout->operator[](2);
UNPACK_CONV_PARAMETER(fm, param);
MARK_USED_VAR

void* filter_ptr = nullptr;
void* bias_ptr = nullptr;
void* z_ptr = nullptr;

if (args.preprocessed_filter) {
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
} else {
filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
reorder_filter(args, m_algo_param.access_size, filter_ptr);
}
bias_ptr = args.bias_tensor->raw_ptr;

if (args.z_layout->ndim > 0)
z_ptr = args.z_tensor->raw_ptr;

// \note these constants of cutlass epilogue will be passed to method
// `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*,
// a different dtype here results in undefined epilogue behaviors
float alpha, beta, gamma, delta, theta;
std::tie(alpha, beta, gamma, delta, theta) = get_constants(args);

float dst_scale = 1.f;
float threshold = 0.f;
bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);

auto dst_dtype = args.dst_layout->dtype.enumv();

bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 16 ||
(m_algo_param.threadblock_n == 32 &&
dst_dtype != DTypeEnum::Float32)));

if (dst_dtype == DTypeEnum::QuantizedS8) { // DTypeEnum::QuantizedS8
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
} else if (dst_dtype == DTypeEnum::QuantizedS4) {
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
} else if (dst_dtype == DTypeEnum::Quantized4Asymm) {
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
} else { // DTypeEnum::Float32
dst_scale = 1.f;
}

cudaStream_t stream = cuda_stream(args.opr->handle());

const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kConvolution,
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,
ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw,
&alpha, &beta, &gamma, &delta, &theta, &threshold,
&dst_scale, stream);

after_kernel_launch();
}

std::string ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::to_string(
AlgoParam algo_param) {
return ssprintf("%dX%dX%d_%dX%dX%d_%d_%d", algo_param.threadblock_m,
algo_param.threadblock_n, algo_param.threadblock_k,
algo_param.warp_m, algo_param.warp_n, algo_param.warp_k,
algo_param.stage, algo_param.access_size);
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::reorder_filter(
const ExecArgs& args, const int iterleaved,
void* reordered_filter) const {
size_t co = args.filter_layout->operator[](0),
ci = args.filter_layout->operator[](3),
fh = args.filter_layout->operator[](1),
fw = args.filter_layout->operator[](2);

cudaStream_t stream = cuda_stream(args.opr->handle());

// reformat filter from nhwc to ncxhwx and reorder oc
// to use trans_oc, threadblock_n must be 16 or 32 and src dtype must equal dst dtype
bool trans_oc = ((co % m_algo_param.threadblock_n == 0) &&
(m_algo_param.threadblock_n == 16 ||
(m_algo_param.threadblock_n == 32 &&
args.dst_layout->dtype.enumv() != DTypeEnum::Float32)));
uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 32) ? 32 : 16;

uint32_t alignbits = iterleaved * 8;

cutlass_wrapper::reorder_nhwc_imma_filter<8>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh,
fw, trans_oc, alignbits, oc_iterleaved, stream);
}
#endif

// vim: syntax=cpp.doxygen
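
To make the epilogue scaling in get_constants() above a bit more concrete, here is a worked numeric sketch for an int8 src/filter with a Quantized4Asymm dst, no z input, and IDENTITY nonlinearity; every scale value is made up purely for illustration.

#include <cstdio>

int main() {
    float src_scale = 0.02f, filter_scale = 0.05f, bias_scale = 0.001f;
    float dst_scale = 0.1f;
    float dst_zero = 8.f;  // zero point of the asymmetric 4-bit output

    float alpha = src_scale * filter_scale / dst_scale;  // rescales the conv accumulator
    float beta = bias_scale / dst_scale;                 // rescales the bias
    float gamma = 0.f, delta = 0.f;                      // no z tensor
    float theta = dst_zero;                              // output zero point

    // With IDENTITY nonlinearity the zero point is folded into delta,
    // matching the "delta += theta; theta = 0" branch above.
    delta += theta;
    theta = 0.f;

    std::printf("alpha=%g beta=%g gamma=%g delta=%g theta=%g\n", alpha, beta,
                gamma, delta, theta);
}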

+ 29
- 10
dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp View File

@@ -102,22 +102,41 @@ ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::get_constants(
args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
bias_scale =
args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
dst_scale;

uint8_t dst_zero = 0;

if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
dst_scale =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;

dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
} else { // DTypeEnum::QuantizedS8
megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8);
dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
}

uint8_t dst_zero =
args.dst_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
float alpha = src_scale * filter_scale / dst_scale,
beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
theta = dst_zero;

if (args.z_layout->ndim > 0) {
float z_scale =
args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
gamma = z_scale / dst_scale;
uint8_t z_zero =
args.z_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
delta = -z_zero * gamma;
float z_scale;
if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
z_scale =
args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
uint8_t z_zero =
args.z_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
gamma = z_scale / dst_scale;
delta = -z_zero * gamma;
} else { // DTypeEnum::QuantizedS8
megdnn_assert(args.z_layout->dtype.enumv() ==
DTypeEnum::QuantizedS8);
z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
gamma = z_scale / dst_scale;
}
}

// identity epilogue has no theta:


+ 1
- 0
dnn/src/cuda/conv_bias/opr_impl.h View File

@@ -65,6 +65,7 @@ public:
class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
class AlgoInt8NCHW32IMMAImplicitGemm;
class AlgoInt8NHWCIMMAImplicitGemm;
class AlgoInt4NCHW64IMMAImplicitGemmBase;
class AlgoInt4Int4NCHW64IMMAImplicitGemm;
class AlgoUInt4Int4NCHW64IMMAImplicitGemm;


+ 2
- 0
dnn/src/cuda/convolution/backward_data/algo.h View File

@@ -275,6 +275,7 @@ public:
private:
WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
const SizeArgs& args) const;
const void* get_available_op(const SizeArgs& args) const;
AlgoParam m_algo_param;
std::string m_name;
};
@@ -295,6 +296,7 @@ public:
private:
WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
const SizeArgs& args) const;
const void* get_available_op(const SizeArgs& args) const;
};

class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {


+ 41
- 30
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp View File

@@ -20,6 +20,43 @@
using namespace megdnn;
using namespace cuda;

const void*
ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_available_op(
const SizeArgs& args) const {
using namespace cutlass::library;
auto&& fm = args.filter_meta;
size_t sh = fm.stride[0], sw = fm.stride[1];
cutlass::conv::SpecialOptimizeDesc special_optimization =
(sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc::
DECONV_DOUBLE_UPSAMPLING
: cutlass::conv::SpecialOptimizeDesc::NONE;
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS8,
LayoutTypeID::kTensorK4RSC4,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
4,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
special_optimization,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
is_available(const SizeArgs& args) const {
auto&& fm = args.filter_meta;
@@ -51,6 +88,7 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
// FIXME: too large filter size is not supported now
available &= fm.spatial[0] * fm.spatial[1] <=
(uint32_t)(848 / (2 * m_algo_param.warp_k / 4) - 2);
available &= (get_available_op(args) != nullptr);
// only support sm_61 or later, platform should have fast native int8
// support
available &= is_compute_capability_required(6, 1);
@@ -105,40 +143,14 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
args.grad_layout->dtype.param<dtype::QuantizedS8>().scale;

// \note these constants of cutlass epilogue will be passed to struct
// `ConvolutionArguments` by pointer and interpreted as ElementCompute*, a
// different dtype here results in undefined epilogue behaviors
// `ConvolutionArguments` by pointer and interpreted as ElementCompute*,
// a different dtype here results in undefined epilogue behaviors
float alpha = diff_scale * filter_scale / grad_scale, beta = 0.f,
gamma = 0.f, delta = 0.f;

using namespace cutlass::library;

// only use 16x64x8_16x64x8_2stages impl
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS8,
LayoutTypeID::kTensorK4RSC4,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
4,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
true,
false};

const Operation* op = Singleton::get().operation_table.find_op(key);
const Operation* op = (const Operation*)get_available_op(args);

// gcc prints warnings when size_t values are implicitly narrowed to int
cutlass::conv::Conv2dProblemSize problem_size{
@@ -167,7 +179,6 @@ void ConvolutionBackwardDataImpl::AlgoPack::fill_int8_dp4a_algos() {
int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2});
}

// vim: syntax=cpp.doxygen
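The hunks above illustrate a refactoring pattern that repeats for the NCHW dgrad kernel and the two float32 SIMT GEMM algorithms later in this diff: the cutlass key construction moves out of exec() into a get_available_op() helper, is_available() additionally requires that the lookup succeeds, and exec() reuses the helper instead of rebuilding the key. A minimal sketch of the pattern (SomeAlgo and make_key are illustrative names, not code from this commit):

    const void* SomeAlgo::get_available_op(const SizeArgs& args) const {
        using namespace cutlass::library;
        // build the full kernel key once, from the filter meta and the tile parameters
        ConvolutionKey key = make_key(args, m_algo_param);
        // find_op() now returns nullptr on a miss instead of asserting
        return (const void*)Singleton::get().operation_table.find_op(key);
    }

    bool SomeAlgo::is_available(const SizeArgs& args) const {
        bool available = true;  // shape/dtype/arch checks elided
        // "available" now also means "a matching cutlass kernel was generated"
        available &= (get_available_op(args) != nullptr);
        return available;
    }

    void SomeAlgo::exec(const ExecArgs& args) const {
        auto* op = static_cast<const Operation*>(get_available_op(args));
        // ... fill the argument struct and run op ...
    }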

+ 42
- 27
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp View File

@@ -19,6 +19,44 @@
using namespace megdnn;
using namespace cuda;

const void*
ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::get_available_op(
const SizeArgs& args) const {
using namespace cutlass::library;
auto&& fm = args.filter_meta;
size_t sh = fm.stride[0], sw = fm.stride[1];
cutlass::conv::SpecialOptimizeDesc special_optimization =
(sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc::
DECONV_DOUBLE_UPSAMPLING
: cutlass::conv::SpecialOptimizeDesc::NONE;
// only use 16x64x8_16x64x8_2stages impl
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS8,
LayoutTypeID::kTensorK4RSC4,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
cutlass::conv::ConvType::kConvolution,
16,
64,
8,
16,
64,
8,
1,
1,
4,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
2,
special_optimization,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available(
const SizeArgs& args) const {
auto&& fm = args.filter_meta;
@@ -52,6 +90,9 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available(
available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1);
// FIXME: too large filter size is not supported now
available &= fm.spatial[0] * fm.spatial[1] <= (848 / (2 * 8 / 4) - 2);

available &= (get_available_op(args) != nullptr);

// only support sm_61 or later, platform should have fast native int8
// support
available &= is_compute_capability_required(6, 1);
@@ -138,33 +179,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec(

using namespace cutlass::library;

// only use 16x64x8_16x64x8_2stages impl
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS8,
LayoutTypeID::kTensorK4RSC4,
NumericTypeID::kS8,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
cutlass::conv::ConvType::kConvolution,
16,
64,
8,
16,
64,
8,
1,
1,
4,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
2,
true,
false};

const Operation* op = Singleton::get().operation_table.find_op(key);
const Operation* op = (const Operation*)get_available_op(args);

// gcc prints warnings when size_t values are implicitly narrowed to int
cutlass::conv::Conv2dProblemSize problem_size{


+ 2
- 2
dnn/src/cuda/cutlass/convolution_operation.h View File

@@ -119,8 +119,8 @@ public:
m_description.threadblock_swizzle = ThreadblockSwizzleMap<
typename Operator::ThreadblockSwizzle>::kId;

m_description.need_load_from_const_mem =
Operator::kNeedLoadFromConstMem;
m_description.special_optimization =
Operator::kSpecialOpt;
m_description.gemm_mode = Operator::kGemmMode;
m_description.without_shared_load = Operator::kWithoutSharedLoad;
}


+ 1
- 1
dnn/src/cuda/cutlass/library.h View File

@@ -487,7 +487,7 @@ struct ConvolutionDescription : public OperationDescription {

ThreadblockSwizzleID threadblock_swizzle;

bool need_load_from_const_mem;
conv::SpecialOptimizeDesc special_optimization;
conv::ImplicitGemmMode gemm_mode;
bool without_shared_load;
};


+ 15
- 13
dnn/src/cuda/cutlass/operation_table.cpp View File

@@ -124,7 +124,7 @@ ConvolutionKey get_convolution_key_from_desc(
key.epilogue_type = desc.epilogue_type;

key.stages = desc.tile_description.threadblock_stages;
key.need_load_from_const_mem = desc.need_load_from_const_mem;
key.special_optimization = desc.special_optimization;
key.without_shared_load = desc.without_shared_load;

return key;
@@ -156,23 +156,25 @@ void OperationTable::append(Manifest const& manifest) {
/////////////////////////////////////////////////////////////////////////////////////////////////

Operation const* OperationTable::find_op(GemmKey const& key) const {
megdnn_assert(gemm_operations.count(key) > 0,
"key not found in cutlass operation table");
auto const& ops = gemm_operations.at(key);
megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
ops.size());
return ops[0];
if (gemm_operations.count(key)) {
auto const& ops = gemm_operations.at(key);
megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
ops.size());
return ops[0];
}
return nullptr;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

Operation const* OperationTable::find_op(ConvolutionKey const& key) const {
megdnn_assert(convolution_operations.count(key) > 0,
"key not found in cutlass operation table");
auto const& ops = convolution_operations.at(key);
megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
ops.size());
return ops[0];
if (convolution_operations.count(key) > 0) {
auto const& ops = convolution_operations.at(key);
megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
ops.size());
return ops[0];
}
return nullptr;
}

/////////////////////////////////////////////////////////////////////////////////////////////////
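With this change a key that has no generated kernel is no longer fatal: find_op() returns nullptr and the caller decides how to react. The algorithms touched in this commit simply report themselves as unavailable, roughly (illustrative, not a line from the diff):

    const Operation* op = Singleton::get().operation_table.find_op(key);
    if (op == nullptr) {
        return false;  // treat a missing kernel as "algorithm unavailable" instead of asserting
    }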


+ 6
- 6
dnn/src/cuda/cutlass/operation_table.h View File

@@ -211,7 +211,7 @@ struct ConvolutionKey {

epilogue::EpilogueType epilogue_type;
int stages;
bool need_load_from_const_mem;
conv::SpecialOptimizeDesc special_optimization;
bool without_shared_load;

inline bool operator==(ConvolutionKey const& rhs) const {
@@ -234,7 +234,7 @@ struct ConvolutionKey {
(instruction_shape_n == rhs.instruction_shape_n) &&
(instruction_shape_k == rhs.instruction_shape_k) &&
(epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) &&
(need_load_from_const_mem == rhs.need_load_from_const_mem) &&
(special_optimization == rhs.special_optimization) &&
(without_shared_load == rhs.without_shared_load);
}

@@ -270,8 +270,8 @@ struct ConvolutionKey {
"\n instruction_shape: " + instruction_shape_str +
"\n epilogue_type: " + to_string(epilogue_type) +
"\n stages: " + std::to_string(stages) +
"\n need_load_from_const_mem: " +
to_string(need_load_from_const_mem) +
"\n special_optimization: " +
to_string(special_optimization) +
"\n without_shared_load: " + to_string(without_shared_load) +
"\n}";
}
@@ -308,8 +308,8 @@ struct ConvolutionKeyHasher {
sizeof(key.instruction_shape_k))
.update(&key.epilogue_type, sizeof(key.epilogue_type))
.update(&key.stages, sizeof(key.stages))
.update(&key.need_load_from_const_mem,
sizeof(key.need_load_from_const_mem))
.update(&key.special_optimization,
sizeof(key.special_optimization))
.update(&key.without_shared_load,
sizeof(key.without_shared_load))
.digest();


+ 29
- 0
dnn/src/cuda/cutlass/util.cu View File

@@ -1569,6 +1569,35 @@ char const* to_string(MathOperationID math_op, bool pretty) {
static struct {
char const* text;
char const* pretty;
conv::SpecialOptimizeDesc enumerant;
} SpecialOptimizeDesc_enumerants[] = {
{"none_special_opt", "NoneSpecialOpt", conv::SpecialOptimizeDesc::NONE},
{"conv_filter_unity", "ConvFilterUnity",
conv::SpecialOptimizeDesc::CONV_FILTER_UNITY},
{"deconv_double_upsampling", "DeconvDoubleUpsampling",
conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING},
};

/// Converts an SpecialOptimizeDesc enumerant to a string
char const* to_string(conv::SpecialOptimizeDesc special_opt, bool pretty) {
for (auto const& possible : SpecialOptimizeDesc_enumerants) {
if (special_opt == possible.enumerant) {
if (pretty) {
return possible.pretty;
} else {
return possible.text;
}
}
}

return pretty ? "Invalid" : "invalid";
}
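Assuming the enumerant table above, the new overload behaves like its neighbours, e.g.:

    to_string(conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING);            // "deconv_double_upsampling"
    to_string(conv::SpecialOptimizeDesc::CONV_FILTER_UNITY, /*pretty=*/true);  // "ConvFilterUnity"

The key-printing code in operation_table.h (earlier in this diff) already calls it for the special_optimization field.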

///////////////////////////////////////////////////////////////////////////////////////////////////

static struct {
char const* text;
char const* pretty;
conv::ImplicitGemmMode enumerant;
} ImplicitGemmMode_enumerants[] = {
{"gemm_nt", "GemmNT", conv::ImplicitGemmMode::GEMM_NT},


+ 4
- 0
dnn/src/cuda/cutlass/util.h View File

@@ -207,6 +207,10 @@ char const* to_string(bool val, bool pretty = false);
/// Converts a MathOperationID enumerant to a string
char const* to_string(MathOperationID math_op, bool pretty = false);

/// Converts a SpecialOptimizeDesc enumerant to a string
char const* to_string(conv::SpecialOptimizeDesc special_opt,
bool pretty = false);

/// Converts an ImplicitGemmMode enumerant to a string
char const* to_string(conv::ImplicitGemmMode mode, bool pretty = false);



+ 4
- 0
dnn/src/cuda/matrix_mul/algos.h View File

@@ -235,6 +235,7 @@ public:
m_name{ssprintf("CUTLASS_FLOAT32_SIMT_%s",
m_algo_param.to_string().c_str())} {}
bool is_available(const SizeArgs& args) const override;

size_t get_workspace_in_bytes(const SizeArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override {
@@ -260,6 +261,7 @@ private:
void do_exec(const ExecArgs& args) const override;
int min_alignment_requirement() const override { return 1; }
std::string m_name;
const void* get_available_op(const SizeArgs& args) const;
};

class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final
@@ -270,6 +272,7 @@ public:
m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s",
m_algo_param.to_string().c_str())} {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override {
@@ -297,6 +300,7 @@ private:
void do_exec(const ExecArgs& args) const override;
int min_alignment_requirement() const override { return 1; }
std::string m_name;
const void* get_available_op(const SizeArgs& args) const;
};

class MatrixMulForwardImpl::AlgoFloat32SIMTGemvBatchedStrided final


+ 36
- 28
dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp View File

@@ -19,6 +19,39 @@
using namespace megdnn;
using namespace cuda;

const void* MatrixMulForwardImpl::AlgoFloat32SIMT::get_available_op(
const SizeArgs& args) const {
using namespace cutlass::library;
auto&& param = args.opr->param();
auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;
auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;

int alignment = min_alignment_requirement();
GemmKey key{NumericTypeID::kF32,
layoutA,
NumericTypeID::kF32,
layoutB,
NumericTypeID::kF32,
LayoutTypeID::kRowMajor,
NumericTypeID::kF32,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
2,
alignment,
alignment,
SplitKMode::kNone};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available(
const SizeArgs& args) const {
bool available =
@@ -34,6 +67,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available(
m_algo_param.threadblock_n <=
y_grid_limit);

available &= (get_available_op(args) != nullptr);

return available;
}

@@ -61,34 +96,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMT::do_exec(

using namespace cutlass::library;

auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;
auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;

int alignment = min_alignment_requirement();
GemmKey key{NumericTypeID::kF32,
layoutA,
NumericTypeID::kF32,
layoutB,
NumericTypeID::kF32,
LayoutTypeID::kRowMajor,
NumericTypeID::kF32,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
2,
alignment,
alignment,
SplitKMode::kNone};

const Operation* op = Singleton::get().operation_table.find_op(key);
const Operation* op = (const Operation*)get_available_op(args);

GemmArguments gemm_args{problem_size,
args.tensor_a.raw_ptr,


+ 36
- 29
dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp View File

@@ -19,6 +19,39 @@
using namespace megdnn;
using namespace cuda;

const void* MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_available_op(
const SizeArgs& args) const {
using namespace cutlass::library;
auto&& param = args.opr->param();
auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;
auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;

int alignment = min_alignment_requirement();
GemmKey key{NumericTypeID::kF32,
layoutA,
NumericTypeID::kF32,
layoutB,
NumericTypeID::kF32,
LayoutTypeID::kRowMajor,
NumericTypeID::kF32,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
2,
alignment,
alignment,
SplitKMode::kParallel};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
const SizeArgs& args) const {
auto&& param = args.opr->param();
@@ -35,6 +68,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
available &= ((m + m_algo_param.threadblock_m - 1) /
m_algo_param.threadblock_m <=
y_grid_limit);
available &= (get_available_op(args) != nullptr);

return available;
}

@@ -66,35 +101,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::do_exec(
float alpha = 1.f, beta = 0.f;

using namespace cutlass::library;

auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;
auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
: LayoutTypeID::kRowMajor;

int alignment = min_alignment_requirement();
GemmKey key{NumericTypeID::kF32,
layoutA,
NumericTypeID::kF32,
layoutB,
NumericTypeID::kF32,
LayoutTypeID::kRowMajor,
NumericTypeID::kF32,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
2,
alignment,
alignment,
SplitKMode::kParallel};

Operation const* op = Singleton::get().operation_table.find_op(key);
const Operation* op = (const Operation*)get_available_op(args);

GemmArguments gemm_args{problem_size,
args.tensor_a.raw_ptr,


+ 130
- 1
dnn/test/cuda/conv_bias_int8.cpp View File

@@ -882,6 +882,125 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
ConvBias::DirectParam{});
check(algo);
}

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC) {
require_compute_capability(7, 5);
Checker<ConvBiasForward> checker(handle_cuda());
auto check = [&checker](const std::string& algo) {
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
UniformIntRNG rng{-8, 8};
UniformIntRNG bias_rng{-50, 50};
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &bias_rng)
.set_rng(3, &rng)
.set_dtype(0, dtype::QuantizedS8{1.2f})
.set_dtype(1, dtype::QuantizedS8{1.3f})
.set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
.set_dtype(3, dtype::QuantizedS8{19.990229f})
.set_dtype(4, dtype::QuantizedS8{19.990228f})
.set_epsilon(1e-3);
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
param.pad_h = param.pad_w = 0;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
};
std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
ConvBias::DirectParam{});
check(algo);
algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
ConvBias::DirectParam{});
check(algo);
}

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_UINT4_WEIGHT_PREPROCESS) {
require_compute_capability(7, 5);
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle_cuda());
auto check = [&checker](const std::string& algo) {
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
UniformIntRNG rng{-8, 8};
UniformIntRNG bias_rng{-50, 50};
UniformIntRNG rng_u4{0, 15};
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &bias_rng)
.set_rng(3, &rng_u4)
.set_dtype(0, dtype::QuantizedS8{0.2f})
.set_dtype(1, dtype::QuantizedS8{0.3f})
.set_dtype(2, dtype::QuantizedS32{0.2f * 0.3f})
.set_dtype(3, dtype::Quantized4Asymm{0.5f, 8})
.set_dtype(4, dtype::Quantized4Asymm{0.5f, 4})
.set_epsilon(1 + 1e-3);
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
param.pad_h = param.pad_w = 0;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
};
std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
ConvBias::DirectParam{});
check(algo);
algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
ConvBias::DirectParam{});
check(algo);
}

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_FLOAT) {
require_compute_capability(7, 5);
Checker<ConvBiasForward> checker(handle_cuda());
auto check = [&checker](const std::string& algo) {
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
UniformIntRNG rng{-8, 8};
UniformFloatRNG float_rng{-50, 50};
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &float_rng)
.set_rng(3, &float_rng)
.set_dtype(0, dtype::QuantizedS8(1.9980618f))
.set_dtype(1, dtype::QuantizedS8(1.9980927f))
.set_dtype(2, dtype::Float32())
.set_dtype(3, dtype::Float32())
.set_dtype(4, dtype::Float32());
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
param.pad_h = param.pad_w = 0;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
};
std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
ConvBias::DirectParam{});
check(algo);
algo = ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
ConvBias::DirectParam{});
check(algo);
}

#endif

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) {
@@ -969,7 +1088,7 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) {
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<
ConvBiasForward>(
ConvBias::algo_name<ConvBias::DirectParam>(
"INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X128X64_64X64X64_2",
"INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X128X32_32X64X32_1",
ConvBias::DirectParam{})
.c_str()));
checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
@@ -1109,6 +1228,16 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
"DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
param::ConvBias::Format::NCHW32);
}

TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NHWC) {
require_compute_capability(7, 5);
benchmark_target_algo_with_cudnn_tsc(
handle_cuda(), get_det_first_bench_args(16),
dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
"DIRECT:INT8_NHWC_IMMA_IMPLICIT_GEMM",
param::ConvBias::Format::NHWC);
}
#endif

TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {


+ 6
- 2
dnn/test/cuda/conv_test_utils.cpp View File

@@ -102,9 +102,7 @@ std::vector<BenchArgs> get_det_first_bench_args(size_t batch) {
args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2});
args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1});
args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 3, 2});
args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1});
args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 1, 2});
return args;
}

@@ -333,6 +331,9 @@ void benchmark_target_algo_with_cudnn_tsc(
.reshape({shape[0], shape[1] / 4, 4, shape[2],
shape[3]})
.dimshuffle({1, 3, 4, 0, 2}));
} else if (format == Format::NHWC) {
ret = static_cast<TensorShape>(
TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1}));
}
return ret;
};
@@ -363,6 +364,9 @@ void benchmark_target_algo_with_cudnn_tsc(
if ((format == Format::CHWN4 || format == Format::NCHW4) &&
(arg.ci % 16 != 0))
continue;
// skip testcase which cannot enable nhwc tensorcore
if ((format == Format::NHWC) && (arg.ci % 4 != 0 || arg.co % 4 != 0))
continue;
Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
? Format::NCHW32
: Format::NCHW4;
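For context: the new NHWC branch above converts a benchmark shape given as NCHW into NHWC via dimshuffle({0, 2, 3, 1}); for example {16, 16, 7, 7} becomes {16, 7, 7, 16}. Cases whose input or output channel count is not a multiple of 4 are skipped because, per the comment, the NHWC tensor-core kernels cannot be enabled for them.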


+ 0
- 1
dnn/test/cuda/convolution.cpp View File

@@ -327,7 +327,6 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
all_params.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2});

for (auto algo_param : all_params) {
Checker<ConvolutionBackwardData> checker(handle_cuda());

