
feat(dnn/cuda): add implicit bmm large kernel dwconv2d fprop impl

GitOrigin-RevId: feb09ebb58
release-1.8
Megvii Engine Team 王彪 3 years ago
parent commit 96050073a2
21 changed files with 4706 additions and 2626 deletions
 1. +649  -411  dnn/scripts/cutlass_generator/conv2d_operation.py
 2. +1013 -716  dnn/scripts/cutlass_generator/gemm_operation.py
 3. +1580 -824  dnn/scripts/cutlass_generator/generator.py
 4. +470  -423  dnn/scripts/cutlass_generator/library.py
 5. +310  -236  dnn/scripts/cutlass_generator/manifest.py
 6. +2    -0    dnn/src/CMakeLists.txt
 7. +27   -0    dnn/src/cuda/conv_bias/algo.cpp
 8. +65   -3    dnn/src/cuda/conv_bias/algo.h
 9. +140  -12   dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
10. +95   -0    dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
11. +95   -0    dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
12. +3    -0    dnn/src/cuda/conv_bias/opr_impl.h
13. +3    -0    dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
14. +3    -0    dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp
15. +3    -0    dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nhwc_imma.cpp
16. +4    -0    dnn/src/cuda/cutlass/initialize_all.cu
17. +3    -0    dnn/src/cuda/cutlass/library.h
18. +21   -0    dnn/src/cuda/cutlass/library_internal.h
19. +4    -0    dnn/src/cuda/cutlass/operation_table.cpp
20. +14   -0    dnn/src/cuda/cutlass/operation_table.h
21. +202  -1    dnn/test/cuda/chanwise_convolution.cpp
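The new algorithms treat a channelwise (depthwise) convolution as a batch of small independent GEMMs, one per (batch, group) pair, instead of one large implicit GEMM over all channels. As a sketch of that mapping (our own illustration in plain C++, not code from this commit):

#include <cstddef>
#include <vector>

// Illustrative CPU reference (not part of the commit): depthwise conv2d fprop
// for NCHW with icpg == ocpg == 1, the case the new kernels target. Each
// (batch, channel) pair is an independent implicit GEMM with
//   M = Ho * Wo (output pixels), N = 1 (ocpg), K = Fh * Fw (filter taps),
// so the whole problem is a *batched* GEMM of N * groups tiny matrix
// products, hence "implicit batched GEMM" in the algorithm names.
void dwconv2d_fprop_ref(
        const std::vector<float>& src,     // N * C * Hi * Wi
        const std::vector<float>& filter,  // C * 1 * 1 * Fh * Fw (group layout)
        std::vector<float>& dst,           // N * C * Ho * Wo
        std::size_t N, std::size_t C, std::size_t Hi, std::size_t Wi,
        std::size_t Fh, std::size_t Fw, std::size_t Ho, std::size_t Wo,
        std::size_t ph, std::size_t pw, std::size_t sh, std::size_t sw) {
    for (std::size_t n = 0; n < N; ++n)
        for (std::size_t c = 0; c < C; ++c)          // one small GEMM per (n, c)
            for (std::size_t oh = 0; oh < Ho; ++oh)
                for (std::size_t ow = 0; ow < Wo; ++ow) {
                    float acc = 0.f;                  // the GEMM K-loop
                    for (std::size_t fh = 0; fh < Fh; ++fh)
                        for (std::size_t fw = 0; fw < Fw; ++fw) {
                            std::ptrdiff_t ih = std::ptrdiff_t(oh * sh + fh) -
                                                std::ptrdiff_t(ph);
                            std::ptrdiff_t iw = std::ptrdiff_t(ow * sw + fw) -
                                                std::ptrdiff_t(pw);
                            if (ih < 0 || iw < 0 || ih >= std::ptrdiff_t(Hi) ||
                                iw >= std::ptrdiff_t(Wi))
                                continue;  // zero padding
                            acc += src[((n * C + c) * Hi + ih) * Wi + iw] *
                                   filter[(c * Fh + fh) * Fw + fw];
                        }
                    dst[((n * C + c) * Ho + oh) * Wo + ow] = acc;
                }
}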

+649 -411   dnn/scripts/cutlass_generator/conv2d_operation.py   (file diff suppressed because it is too large)
+1013 -716  dnn/scripts/cutlass_generator/gemm_operation.py     (file diff suppressed because it is too large)
+1580 -824  dnn/scripts/cutlass_generator/generator.py          (file diff suppressed because it is too large)
+470 -423   dnn/scripts/cutlass_generator/library.py            (file diff suppressed because it is too large)


+310 -236  dnn/scripts/cutlass_generator/manifest.py

@@ -10,24 +10,25 @@ import shutil

from library import *
from gemm_operation import *
from conv2d_operation import *

###################################################################################################


class EmitOperationKindLibrary:
    def __init__(self, generated_path, kind, args):
        self.generated_path = generated_path
        self.kind = kind
        self.args = args

        self.emitters = {
            OperationKind.Gemm: EmitGemmConfigurationLibrary,
            OperationKind.Conv2d: EmitConv2dConfigurationLibrary,
        }

        self.configurations = []

        self.header_template = """
/*
 Generated by manifest.py - Do not edit.
*/
@@ -42,17 +43,19 @@ namespace library {

///////////////////////////////////////////////////////////////////////////////////////////////////

"""
        self.entry_template = """

//
// Entry point to construct operations
//
void initialize_all_${operation_name}_operations(Manifest &manifest) {
"""
        self.configuration_prototype_template = (
            "void initialize_${configuration_name}(Manifest &manifest);\n"
        )
        self.configuration_template = "  initialize_${configuration_name}(manifest);\n"

        self.epilogue_template = """

}

@@ -63,91 +66,118 @@ void initialize_all_${operation_name}_operations(Manifest &manifest) {

"""

    #
    def __enter__(self):
        self.operation_path = os.path.join(
            self.generated_path, OperationKindNames[self.kind]
        )
        os.mkdir(self.operation_path)

        self.top_level_path = os.path.join(
            self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind]
        )

        self.top_level_file = open(self.top_level_path, "w")
        self.top_level_file.write(self.header_template)

        self.source_files = [self.top_level_path]

        return self

    #
    def emit(self, configuration_name, operations):

        with self.emitters[self.kind](
            self.operation_path, configuration_name
        ) as configuration_emitter:
            for operation in operations:
                configuration_emitter.emit(operation)

            self.source_files.append(configuration_emitter.configuration_path)

        self.configurations.append(configuration_name)
        self.top_level_file.write(
            SubstituteTemplate(
                self.configuration_prototype_template,
                {"configuration_name": configuration_name},
            )
        )

    #
    def __exit__(self, exception_type, exception_value, traceback):
        self.top_level_file.write(
            SubstituteTemplate(
                self.entry_template, {"operation_name": OperationKindNames[self.kind]}
            )
        )

        for configuration_name in self.configurations:
            self.top_level_file.write(
                SubstituteTemplate(
                    self.configuration_template,
                    {"configuration_name": configuration_name},
                )
            )

        self.top_level_file.write(self.epilogue_template)
        self.top_level_file.close()


###################################################################################################
###################################################################################################


class Options:
    def __init__(self):
        pass


###################################################################################################

#
class Manifest:

    #
    def __init__(self, args):
        self.operations = {}
        self.args = args

        architectures = (
            args.architectures.split(";") if len(args.architectures) else ["50"]
        )
        self.compute_capabilities = [int(x) for x in architectures]

        self.selected_kernels = []

        if args.operations == "all":
            self.operations_enabled = []
        else:

            operations_list = [OperationKind.Gemm, OperationKind.Conv2d]

            self.operations_enabled = [
                x
                for x in operations_list
                if OperationKindNames[x] in args.operations.split(",")
            ]

        if args.kernels == "all":
            self.kernel_names = []
        else:
            self.kernel_names = [x for x in args.kernels.split(",") if x != ""]

        self.ignore_kernel_names = [
            x for x in args.ignore_kernels.split(",") if x != ""
        ]

        if args.kernel_filter_file is None:
            self.kernel_filter_list = []
        else:
            self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)

        self.operation_count = 0
        self.operations_by_name = {}
        self.top_level_prologue = """

#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
@@ -159,208 +189,241 @@ ${prototypes}

void initialize_all(Manifest &manifest) {

"""
        self.top_level_reserve = "  manifest.reserve(${operation_count});\n\n"
        self.top_level_epilogue = """
}

} // namespace library
} // namespace cutlass

"""

    def get_kernel_filters(self, kernelListFile):
        if os.path.isfile(kernelListFile):
            with open(kernelListFile, "r") as fileReader:
                lines = [
                    line.rstrip() for line in fileReader if not line.startswith("#")
                ]

            lines = [re.compile(line) for line in lines if line]
            return lines
        else:
            return []

    def filter_out_kernels(self, kernel_name, kernel_filter_list):

        for kernel_filter_re in kernel_filter_list:
            if kernel_filter_re.search(kernel_name) is not None:
                return True

        return False

    #
    def _filter_string_matches(self, filter_string, haystack):
        """ Returns true if all substrings appear in the haystack in order"""
        substrings = filter_string.split("*")
        for sub in substrings:
            idx = haystack.find(sub)
            if idx < 0:
                return False
            haystack = haystack[idx + len(sub) :]
        return True

    #
    def filter(self, operation):
        """ Filtering operations based on various criteria"""

        # filter based on compute capability
        enabled = False
        for cc in self.compute_capabilities:
            if (
                cc >= operation.tile_description.minimum_compute_capability
                and cc <= operation.tile_description.maximum_compute_capability
            ):

                enabled = True
                break

        if not enabled:
            return False

        if (
            len(self.operations_enabled)
            and not operation.operation_kind in self.operations_enabled
        ):
            return False

        # eliminate duplicates
        if operation.procedural_name() in self.operations_by_name.keys():
            return False

        # Filter based on list of valid substrings
        if len(self.kernel_names):
            name = operation.procedural_name()
            enabled = False

            # compare against the include list
            for name_substr in self.kernel_names:
                if self._filter_string_matches(name_substr, name):
                    enabled = True
                    break

            # compare against the exclude list
            for name_substr in self.ignore_kernel_names:
                if self._filter_string_matches(name_substr, name):
                    enabled = False
                    break

        if len(self.kernel_filter_list) > 0:
            enabled = False
            if self.filter_out_kernels(
                operation.procedural_name(), self.kernel_filter_list
            ):
                enabled = True

        # todo: filter based on compute data type
        return enabled

    #
    def append(self, operation):
        """
        Inserts the operation.

        operation_kind -> configuration_name -> []
        """

        if self.filter(operation):

            self.selected_kernels.append(operation.procedural_name())

            self.operations_by_name[operation.procedural_name()] = operation

            # add the configuration
            configuration_name = operation.configuration_name()

            if operation.operation_kind not in self.operations.keys():
                self.operations[operation.operation_kind] = {}

            if (
                configuration_name
                not in self.operations[operation.operation_kind].keys()
            ):
                self.operations[operation.operation_kind][configuration_name] = []

            self.operations[operation.operation_kind][configuration_name].append(
                operation
            )
            self.operation_count += 1

    #
    def emit(self, target=GeneratorTarget.Library):

        operation_emitters = {GeneratorTarget.Library: EmitOperationKindLibrary}

        generated_path = os.path.join(self.args.curr_build_dir, "generated")

        # create generated/
        if os.path.exists(generated_path):
            shutil.rmtree(generated_path)

        os.mkdir(generated_path)

        source_files = []

        top_level_path = os.path.join(generated_path, "initialize_all.cpp")
        with open(top_level_path, "w") as top_level_file:

            if target == GeneratorTarget.Library:
                source_files.append(top_level_path)

            prototypes = []
            for operation_kind, configurations in self.operations.items():
                prototypes.append(
                    SubstituteTemplate(
                        "void initialize_all_${operation_kind}_operations(Manifest &manifest);",
                        {"operation_kind": OperationKindNames[operation_kind]},
                    )
                )

            top_level_file.write(
                SubstituteTemplate(
                    self.top_level_prologue, {"prototypes": "\n".join(prototypes)}
                )
            )

            top_level_file.write(
                SubstituteTemplate(
                    self.top_level_reserve,
                    {"operation_count": str(self.operation_count)},
                )
            )

            # for each operation kind, emit initializer for all configurations
            for operation_kind, configurations in self.operations.items():

                with operation_emitters[target](
                    generated_path, operation_kind, self.args
                ) as operation_kind_emitter:
                    for configuration_name, operations in configurations.items():
                        operation_kind_emitter.emit(configuration_name, operations)

                    source_files += operation_kind_emitter.source_files

                top_level_file.write(
                    SubstituteTemplate(
                        "  initialize_all_${operation_kind}_operations(manifest);\n",
                        {"operation_kind": OperationKindNames[operation_kind]},
                    )
                )

            top_level_file.write(self.top_level_epilogue)

        # write the manifest.cmake file containing paths from all targets
        manifest_path = os.path.join(generated_path, "manifest.cmake")
        with open(manifest_path, "w") as manifest_file:

            target_name = "cutlass_library_objs"

            target_text = SubstituteTemplate(
                """cutlass_target_sources(
  ${target_name}
  BATCH_SOURCES ON
  PRIVATE
""",
                {"target_name": target_name},
            )

            manifest_file.write(target_text)

            for source_file in source_files:
                manifest_file.write("    %s\n" % str(source_file.replace("\\", "/")))
            manifest_file.write(")")

    #


###################################################################################################


def GenerateManifest(args, operations, output_dir):
    assert isinstance(operations, list)
    if len(operations) == 0:
        return
    op = operations[0]
    required_cuda_ver_major = op.required_cuda_ver_major
    required_cuda_ver_minor = op.required_cuda_ver_minor

    manifest_path = os.path.join(
        output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type)
    )
    f = open(manifest_path, "w")
    f.write(
        """
/*
 Generated by generator.py - Do not edit.
*/
@@ -374,24 +437,35 @@ def GenerateManifest(args, operations, output_dir):

namespace cutlass {
namespace library {

"""
        % (
            str(required_cuda_ver_major),
            str(required_cuda_ver_major),
            str(required_cuda_ver_minor),
        )
    )

    for op in operations:
        f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name())

    f.write(
        """
void initialize_all_%s_%s_operations(Manifest &manifest) {
"""
        % (args.operations, args.type)
    )

    for op in operations:
        f.write("  initialize_%s(manifest);\n" % op.procedural_name())

    f.write(
        """
}

} // namespace library
} // namespace cutlass

#endif
"""
    )
    f.close()
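For orientation, the templates above assemble a single generated initialize_all.cpp. Assuming a build with the Gemm and Conv2d kinds enabled and an illustrative operation count, the emitted file would look roughly like this (reconstructed by hand from top_level_prologue, top_level_reserve, the per-kind entry line, and top_level_epilogue; the real banner and count differ per build):

#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"

namespace cutlass {
namespace library {

void initialize_all_gemm_operations(Manifest &manifest);
void initialize_all_conv2d_operations(Manifest &manifest);

void initialize_all(Manifest &manifest) {
  manifest.reserve(1234);  // ${operation_count}, illustrative

  initialize_all_gemm_operations(manifest);
  initialize_all_conv2d_operations(manifest);
}

} // namespace library
} // namespace cutlass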

+2 -0  dnn/src/CMakeLists.txt

@@ -181,6 +181,8 @@ if(MGE_WITH_CUDA)
gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES)
gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES)
gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_fprop simt CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_fprop tensorop884 CUTLASS_SOURCES)
list(APPEND SOURCES ${CUTLASS_SOURCES})
list(APPEND SOURCES ${CUSOURCES})
endif()


+27 -0  dnn/src/cuda/conv_bias/algo.cpp

@@ -92,6 +92,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
for (auto&& algo : int8_nchw4_dotprod) {
all_algos.push_back(&algo);
}
fill_dwconv_algos();
all_algos.push_back(&int8_chwn4_dotprod);
all_algos.push_back(&fallback_nchw_qs8);
for (size_t i = all_algo_size; i < all_algos.size(); ++i) {
@@ -301,6 +302,32 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
}
#endif

void ConvBiasForwardImpl::AlgoPack::fill_dwconv_algos() {
using AlgoParam = AlgoCutlassConvolutionBase::AlgoParam;
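    // Assumed AlgoParam field order (consistent with the kernel names checked
    // by the tests below, e.g. 128X128X8_32X64X8_2stage): threadblock m/n/k,
    // warp m/n/k, instruction m/n/k, number of pipeline stages.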
f32_implicit_bmm.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{64, 128, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{64, 64, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 1, 1, 1, 2});
for (auto&& algo : f32_implicit_bmm) {
all_algos.push_back(&algo);
}
#if CUDA_VERSION >= 10020
f16_implicit_bmm.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2});
for (auto&& algo : f16_implicit_bmm) {
all_algos.push_back(&algo);
}
#endif
}

void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() {
using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam;
int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32, 1, 1, 4, 2});


+65 -3  dnn/src/cuda/conv_bias/algo.h

@@ -84,7 +84,9 @@ public:
CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8,
CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4,
CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4,
    CUDA_FALLBACK_NCHW_INT4,
CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

@@ -503,6 +505,8 @@ public:
* +----+--- AlgoInt4Int4NHWCIMMAImplicitGemm
* +----+--- AlgoUInt4Int4NHWCIMMAImplicitGemm
* +
* +--- AlgoFloat32NCHWImplicitBatchedGemm
* +--- AlgoFloat16NCHWHMMAImplicitBatchedGemm
*/

/*
@@ -516,7 +520,13 @@ public:

// corresponds to cutlass::conv::ConvType. we hope that algo.h does not
// depend on cutlass headers
    enum class ConvType {
        kConvolution,
        kBatchConvolution,
        kLocal,
        kLocalShare,
        kDepthwiseConvolution,
    };

// common parameters for operation selection
struct AlgoParam {
@@ -558,7 +568,8 @@ public:
size_t wo, size_t ph, size_t pw, size_t sh, size_t sw, size_t dh, size_t dw,
const void* alpha, const void* beta, const void* gamma, const void* delta,
const void* theta, const void* threshold, const void* dst_scale,
            cudaStream_t stream, const void* extra_param = nullptr,
            size_t groups = 1) const;

protected:
AlgoParam m_algo_param;
@@ -992,6 +1003,54 @@ private:
};
#endif

class ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
: public AlgoCutlassConvolutionBase {
public:
AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
: AlgoCutlassConvolutionBase(algo_param) {
m_name = ConvBias::algo_name<ConvBias::DirectParam>(
ssprintf(
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str()),
ConvBias::DirectParam{});
}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
return 0;
}
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); };
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32);

private:
std::string m_name;
};

class ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
: public AlgoCutlassConvolutionBase {
public:
AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
: AlgoCutlassConvolutionBase(algo_param) {
m_name = ConvBias::algo_name<ConvBias::DirectParam>(
ssprintf(
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str()),
ConvBias::DirectParam{});
}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
return 0;
}
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); };
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16);

private:
std::string m_name;
};

class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
@@ -1048,6 +1107,8 @@ public:
std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;
std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma;
#endif
std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> f32_implicit_bmm;
std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> f16_implicit_bmm;
AlgoGroupConvGeneral group;
AlgoBFloat16 bfloat16;

@@ -1063,6 +1124,7 @@ private:
#endif
void fill_cudnn_algos();
void fill_dp4a_algos();
void fill_dwconv_algos();
};

} // namespace cuda


+140 -12  dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp

@@ -74,13 +74,18 @@ cutlass::conv::ConvType convert_conv_type(Base::ConvType conv_type) {
return cutlass::conv::ConvType::kLocal;
case Base::ConvType::kLocalShare:
return cutlass::conv::ConvType::kLocalShare;
case Base::ConvType::kDepthwiseConvolution:
return cutlass::conv::ConvType::kDepthwiseConvolution;
default:
megdnn_assert(0, "invalid conv type");
}
}

NumericTypeID convert_dtype(DType dtype) {
    // just make convolution with no bias happy
    if (!dtype.valid())
        return NumericTypeID::kF32;
    switch (dtype.enumv()) {
case DTypeEnum::Float32:
return NumericTypeID::kF32;
case DTypeEnum::Float16:
@@ -100,6 +105,21 @@ NumericTypeID convert_dtype(DTypeEnum dtype) {
}
}

NumericTypeID get_accumulator_dtype(
DType dtype, const param::ConvBias::ComputeMode comp_mode) {
if (dtype.category() == DTypeCategory::QUANTIZED) {
return NumericTypeID::kS32;
} else {
megdnn_assert(dtype.category() == DTypeCategory::FLOAT);
if (comp_mode == param::ConvBias::ComputeMode::DEFAULT) {
return convert_dtype(dtype);
} else {
megdnn_assert(comp_mode == param::ConvBias::ComputeMode::FLOAT32);
return NumericTypeID::kF32;
}
}
}

struct LayoutPack {
LayoutTypeID src;
LayoutTypeID filter;
@@ -149,6 +169,9 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format, int access_type
default:
megdnn_assert(0, "invalid access_type");
}
case Format::NCHW:
return {LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW,
LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW};
default:
megdnn_assert(0, "invalid format");
}
@@ -177,6 +200,93 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, bool cla
megdnn_assert(0, "invalid nonlinear mode");
}

std::pair<int, int> get_tensor_alignment(
const param::ConvBias::Format format, const TensorLayout& src,
const TensorLayout& filter, const Base::AlgoParam& algo_param,
bool is_chanwise) {
int alignment_src = 0;
int alignment_filter = 0;

using Format = param::ConvBias::Format;

// get tensor alignment for tensor op operations
// for tensor op operations, the alignment is determined by the size of a vector
auto get_tensor_alignment_tensor_op = [&]() {
switch (format) {
/// case int8
case Format::NCHW32:
case Format::NCHW32_NCHW4:
alignment_src = 16;
alignment_filter = 16;
break;
/// case int4 or uint4
case Format::NCHW64:
alignment_src = 32;
alignment_filter = 32;
break;
case Format::NHWC:
alignment_src = alignment_filter = algo_param.access_size;
break;
default:
megdnn_throw("invalid format");
};
};

// get tensor alignment for dot product operations
// for integer dot product operations, alignment src is always 4
// and the alignment filter is determined by the threadblock shape
auto get_tensor_alignment_dp4a = [&]() {
megdnn_assert(
format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32);
alignment_src = 4;
// determine alignment filter
constexpr int warp_size = 32;
int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n *
algo_param.threadblock_k /
(algo_param.warp_m * algo_param.warp_n * algo_param.warp_k);
int threadblock_loads = filter.dtype.size(
algo_param.threadblock_m * algo_param.threadblock_n *
algo_param.threadblock_k);
int load_per_thread = threadblock_loads / threads;
if (load_per_thread >= 16)
alignment_filter = 16;
else if (load_per_thread >= 8)
alignment_filter = 8;
else {
megdnn_assert(load_per_thread >= 4);
alignment_filter = 4;
}
};

// get tensor alignment for depthwise convolution
auto get_tensor_alignment_dwconv2d_nchw = [&]() {
alignment_filter = 1;
size_t wi = src.dtype.size(src[3]); // width extent in bytes
for (size_t candidate : {16, 4, 2}) {
if (wi % candidate == 0) {
alignment_src = candidate;
break;
}
}
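        // alignment_src above is in bytes; dividing by the element size
        // (dtype.size(1) is the size of one element in bytes) converts it
        // back to a count of elements, which is what the kernel table expects.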
alignment_src /= src.dtype.size(1);
};

    if (format == Format::NCHW32 || format == Format::NCHW32_NCHW4 ||
        format == Format::NCHW64 || format == Format::NHWC) {
        get_tensor_alignment_tensor_op();
} else if (
format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32) {
get_tensor_alignment_dp4a();
} else {
/// the following is used for depthwise convolution
megdnn_assert(format == Format::NCHW && is_chanwise);
get_tensor_alignment_dwconv2d_nchw();
}
megdnn_assert(alignment_src >= 1 && alignment_filter >= 1);
return {alignment_src, alignment_filter};
}
} // namespace

const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
@@ -185,23 +295,36 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co
auto&& param = args.opr->param();
auto layouts = get_layout_pack(param.format, m_algo_param.access_size);
auto epilogue_type = get_epilogue_type(
            param.nonlineMode,
            args.dst_layout->dtype.category() != DTypeCategory::FLOAT);

cutlass::conv::SpecialOptimizeDesc special_optimization =
(use_conv_filter_unity_opt)
? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY
: cutlass::conv::SpecialOptimizeDesc::NONE;

int alignment_src, alignment_filter;
auto&& fm = args.filter_meta;
bool is_chanwise = param.sparse == param::ConvBias::Sparse::GROUP && fm.icpg == 1 &&
fm.ocpg == 1;
std::tie(alignment_src, alignment_filter) = get_tensor_alignment(
param.format, *args.src_layout, *args.filter_layout, m_algo_param,
is_chanwise);

auto accumulator_dtype =
get_accumulator_dtype(args.src_layout->dtype, param.compute_mode);

ConvolutionKey key{
convert_conv_op(conv_op),
            convert_dtype(args.src_layout->dtype),
            layouts.src,
            convert_dtype(args.filter_layout->dtype),
            layouts.filter,
            convert_dtype(args.dst_layout->dtype),
            layouts.dst,
            convert_dtype(args.bias_layout->dtype),
layouts.bias,
accumulator_dtype,
convert_conv_type(conv_type),
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
@@ -215,6 +338,8 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co
epilogue_type,
m_algo_param.stage,
special_optimization,
alignment_src,
alignment_filter,
without_shared_load};

return Singleton::get().operation_table.find_op(key);
@@ -227,13 +352,16 @@ void ConvBiasForwardImpl::AlgoCutlassConvolutionBase::execute_cutlass_conv_op(
size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha,
const void* beta, const void* gamma, const void* delta, const void* theta,
const void* threshold, const void* dst_scale, cudaStream_t stream,
        const void* extra_param, size_t groups) const {
// gcc prints warnings when size_t values are implicitly narrowed to int
cutlass::conv::Conv2dProblemSize problem_size{
            int(n),  int(hi), int(wi), int(ci),
            int(co), int(fh), int(fw), int(ho),
            int(wo), int(ph), int(pw), int(sh),
            int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation,
            1,            // split k slices, always 1
            int(groups),  // groups
    };

ConvolutionArguments conv_args{
problem_size, src, filter, bias, z, dst, alpha,

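To make the new groups plumbing concrete, here is how the problem size works out for one of the channelwise cases exercised by the tests below (src {8, 2, 16, 16}, filter {2, 1, 1, 15, 15}, padding 7, stride 1); the numbers are our own worked example, the struct layout is the one shown above:

// Channelwise fprop: N=8, C=2 (so groups=2, ci=co=2), 15x15 filter,
// pad 7, stride 1, dilation 1  =>  Ho = Wo = (16 + 2*7 - 15) / 1 + 1 = 16.
cutlass::conv::Conv2dProblemSize problem_size{
        8,  16, 16, 2,   // n, hi, wi, ci
        2,  15, 15, 16,  // co, fh, fw, ho
        16, 7,  7,  1,   // wo, ph, pw, sh
        1,  1,  1,  cutlass::conv::Mode::kCrossCorrelation,
        1,               // split k slices, always 1
        2,               // groups
};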

+95 -0  dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp

@@ -0,0 +1,95 @@
/**
* \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;

bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(
args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
using Param = param::ConvBias;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float16);
RETURN_IF_FALSE(
args.bias_layout->ndim <= 0 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float16 &&
check_bias_share_in_channel(*args.bias_layout, param.format)));
RETURN_IF_FALSE(
args.z_layout->ndim <= 0 ||
args.z_layout->dtype.enumv() == DTypeEnum::Float16);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

void ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2),
wi = args.src_layout->operator[](3);
size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3);
size_t co = fm.group;
size_t ci = co;
// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f;
void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr;
float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f;
void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr;

// dummy parameters, used for quantization cases
float theta = 0.f;
float delta = 0.f;
float threshold = 0.f;

const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);

UNPACK_CONV_PARAMETER(fm, param);
MARK_USED_VAR
execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr,
z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
&threshold, nullptr, stream, nullptr, fm.group);

after_kernel_launch();
}

// vim: syntax=cpp.doxygen

+95 -0  dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp

@@ -0,0 +1,95 @@
/**
* \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;

bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(
args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
using Param = param::ConvBias;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
RETURN_IF_FALSE(
args.bias_layout->ndim <= 0 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
check_bias_share_in_channel(*args.bias_layout, param.format)));
RETURN_IF_FALSE(
args.z_layout->ndim <= 0 ||
args.z_layout->dtype.enumv() == DTypeEnum::Float32);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

void ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2),
wi = args.src_layout->operator[](3);
size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3);
size_t co = fm.group;
size_t ci = co;
// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f;
void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr;
float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f;
void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr;

// dummy parameters, used for quantization cases
float theta = 0.f;
float delta = 0.f;
float threshold = 0.f;

const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);

UNPACK_CONV_PARAMETER(fm, param);
MARK_USED_VAR
execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr,
z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
&threshold, nullptr, stream, nullptr, fm.group);

after_kernel_launch();
}

// vim: syntax=cpp.doxygen

+3 -0  dnn/src/cuda/conv_bias/opr_impl.h

@@ -71,6 +71,9 @@ public:
class AlgoInt4Int4NHWCIMMAImplicitGemm;
class AlgoUInt4Int4NHWCIMMAImplicitGemm;
class AlgoBFloat16;
// The following algorithms are suitable for channel wise convolution
class AlgoFloat32NCHWFMAImplicitBatchedGemm;
class AlgoFloat16NCHWHMMAImplicitBatchedGemm;

class AlgoPack;



+3 -0  dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp

@@ -39,6 +39,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
@@ -52,6 +53,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
special_optimization,
4,
16,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}


+3 -0  dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp

@@ -39,6 +39,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
cutlass::conv::ConvType::kConvolution,
16,
64,
@@ -52,6 +53,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
2,
special_optimization,
4,
4,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}


+3 -0  dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nhwc_imma.cpp

@@ -50,6 +50,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail
LayoutTypeID::kTensorNHWC,
NumericTypeID::kS32,
LayoutTypeID::kTensorNHWC,
NumericTypeID::kS32,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
@@ -63,6 +64,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
special_optimization,
m_algo_param.access_size,
m_algo_param.access_size,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}


+4 -0  dnn/src/cuda/cutlass/initialize_all.cu

@@ -54,24 +54,28 @@ namespace library {
void initialize_all_gemm_simt_operations(Manifest& manifest);
void initialize_all_conv2d_simt_operations(Manifest& manifest);
void initialize_all_deconv_simt_operations(Manifest& manifest);
void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest);
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
void initialize_all_gemm_tensorop884_operations(Manifest& manifest);
void initialize_all_gemm_tensorop1688_operations(Manifest& manifest);
void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest);
void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest);
void initialize_all_deconv_tensorop8816_operations(Manifest& manifest);
void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest);
#endif

void initialize_all(Manifest& manifest) {
initialize_all_gemm_simt_operations(manifest);
initialize_all_conv2d_simt_operations(manifest);
initialize_all_deconv_simt_operations(manifest);
initialize_all_dwconv2d_fprop_simt_operations(manifest);
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
initialize_all_gemm_tensorop884_operations(manifest);
initialize_all_gemm_tensorop1688_operations(manifest);
initialize_all_conv2d_tensorop8816_operations(manifest);
initialize_all_conv2d_tensorop8832_operations(manifest);
initialize_all_deconv_tensorop8816_operations(manifest);
initialize_all_dwconv2d_fprop_tensorop884_operations(manifest);
#endif
}



+3 -0  dnn/src/cuda/cutlass/library.h

@@ -223,6 +223,9 @@ enum class ThreadblockSwizzleID {
kConvolutionFpropTrans,
kConvolutionDgradNCxHWx,
kConvolutionDgradTrans,
kDepthwiseConvolutionFprop,
kDepthwiseConvolutionDgrad,
kDepthwiseConvolutionWgrad,
kInvalid
};



+21 -0  dnn/src/cuda/cutlass/library_internal.h

@@ -570,6 +570,27 @@ struct ThreadblockSwizzleMap<
ThreadblockSwizzleID::kConvolutionDgradTrans;
};

template <>
struct ThreadblockSwizzleMap<
conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle> {
static ThreadblockSwizzleID const kId =
ThreadblockSwizzleID::kDepthwiseConvolutionFprop;
};

template <>
struct ThreadblockSwizzleMap<
conv::threadblock::DepthwiseConvolutionDgradThreadblockSwizzle> {
static ThreadblockSwizzleID const kId =
ThreadblockSwizzleID::kDepthwiseConvolutionDgrad;
};

template <>
struct ThreadblockSwizzleMap<
conv::threadblock::DepthwiseConvolutionWgradThreadblockSwizzle> {
static ThreadblockSwizzleID const kId =
ThreadblockSwizzleID::kDepthwiseConvolutionWgrad;
};
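These specializations extend the existing trait-map pattern in this header: each threadblock-swizzle type is mapped at compile time to a runtime ThreadblockSwizzleID so that generated kernels can describe themselves to the operation table. A lookup (our illustration, not part of the diff) is just:

// Compile-time type -> runtime enum ID; usable in constant expressions since
// kId is an in-class-initialized static const of enum type.
constexpr ThreadblockSwizzleID id = ThreadblockSwizzleMap<
        conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle>::kId;
static_assert(
        id == ThreadblockSwizzleID::kDepthwiseConvolutionFprop, "trait map");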

/////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Element, typename Layout>


+4 -0  dnn/src/cuda/cutlass/operation_table.cpp

@@ -99,6 +99,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)
key.layout_dst = desc.dst.layout;
key.element_bias = desc.bias.element;
key.layout_bias = desc.bias.layout;
key.element_accumulator =
desc.tile_description.math_instruction.element_accumulator;

key.convolution_type = desc.convolution_type;

@@ -124,6 +126,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)

key.stages = desc.tile_description.threadblock_stages;
key.special_optimization = desc.special_optimization;
key.alignment_src = desc.src.alignment;
key.alignment_filter = desc.filter.alignment;
key.without_shared_load = desc.without_shared_load;

return key;


+14 -0  dnn/src/cuda/cutlass/operation_table.h

@@ -188,6 +188,7 @@ struct ConvolutionKey {
library::LayoutTypeID layout_dst;
library::NumericTypeID element_bias;
library::LayoutTypeID layout_bias;
NumericTypeID element_accumulator;

conv::ConvType convolution_type;

@@ -206,6 +207,10 @@ struct ConvolutionKey {
epilogue::EpilogueType epilogue_type;
int stages;
conv::SpecialOptimizeDesc special_optimization;

int alignment_src;
int alignment_filter;

bool without_shared_load;

inline bool operator==(ConvolutionKey const& rhs) const {
@@ -215,6 +220,7 @@ struct ConvolutionKey {
(layout_filter == rhs.layout_filter) &&
(element_dst == rhs.element_dst) && (layout_dst == rhs.layout_dst) &&
(element_bias == rhs.element_bias) && (layout_bias == rhs.layout_bias) &&
(element_accumulator == rhs.element_accumulator) &&
(convolution_type == rhs.convolution_type) &&
(threadblock_shape_m == rhs.threadblock_shape_m) &&
(threadblock_shape_n == rhs.threadblock_shape_n) &&
@@ -227,6 +233,8 @@ struct ConvolutionKey {
(instruction_shape_k == rhs.instruction_shape_k) &&
(epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) &&
(special_optimization == rhs.special_optimization) &&
(alignment_src == rhs.alignment_src) &&
(alignment_filter == rhs.alignment_filter) &&
(without_shared_load == rhs.without_shared_load);
}

@@ -254,6 +262,7 @@ struct ConvolutionKey {
"\n layout_dst: " + to_string(layout_dst) +
"\n element_bias: " + to_string(element_bias) +
"\n layout_bias: " + to_string(layout_bias) +
"\n element_accumulator: " + to_string(element_accumulator) +
"\n convolution_type: " + to_string(convolution_type) +
"\n threadblock_shape: " + threadblock_shape_str +
"\n warp_shape: " + warp_shape_str +
@@ -261,6 +270,8 @@ struct ConvolutionKey {
"\n epilogue_type: " + to_string(epilogue_type) +
"\n stages: " + std::to_string(stages) +
"\n special_optimization: " + to_string(special_optimization) +
"\n alignment_src: " + std::to_string(alignment_src) +
"\n alignment_filter: " + std::to_string(alignment_filter) +
"\n without_shared_load: " + to_string(without_shared_load) + "\n}";
}
};
@@ -278,6 +289,7 @@ struct ConvolutionKeyHasher {
.update(&key.layout_dst, sizeof(key.layout_dst))
.update(&key.element_bias, sizeof(key.element_bias))
.update(&key.layout_bias, sizeof(key.layout_bias))
.update(&key.element_accumulator, sizeof(key.element_accumulator))
.update(&key.convolution_type, sizeof(key.convolution_type))
.update(&key.threadblock_shape_m, sizeof(key.threadblock_shape_m))
.update(&key.threadblock_shape_n, sizeof(key.threadblock_shape_n))
@@ -291,6 +303,8 @@ struct ConvolutionKeyHasher {
.update(&key.epilogue_type, sizeof(key.epilogue_type))
.update(&key.stages, sizeof(key.stages))
.update(&key.special_optimization, sizeof(key.special_optimization))
.update(&key.alignment_src, sizeof(key.alignment_src))
.update(&key.alignment_filter, sizeof(key.alignment_filter))
.update(&key.without_shared_load, sizeof(key.without_shared_load))
.digest();
}


+202 -1  dnn/test/cuda/chanwise_convolution.cpp

@@ -38,8 +38,10 @@ bool check_need_full_bench() {
}
#endif

Convolution::Param gconv_param(Convolution::Param p, bool io16xc32 = false) {
p.sparse = Convolution::Param::Sparse::GROUP;
if (io16xc32)
p.compute_mode = Convolution::Param::ComputeMode::FLOAT32;
return p;
}

@@ -421,6 +423,129 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
}
}

namespace {
template <typename Op>
struct AlgoCheckerMaker {
static auto make(const char* name, bool* require_algo) {
return AlgoChecker<Op>(name, require_algo);
}
};

template <>
struct AlgoCheckerMaker<ConvolutionForward> {
static auto make(const char* name, bool* require_algo) {
return AlgoChecker<ConvolutionForward>(
ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
name, {})
.c_str(),
{}}}},
require_algo);
}
};

template <typename Op>
void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* name) {
Checker<Op> checker(handle);
bool require_algo = false;
checker.set_before_exec_callback(AlgoCheckerMaker<Op>::make(name, &require_algo));
checker.set_dtype(0, io_type).set_dtype(1, io_type).set_dtype(2, io_type);
bool io16xc32 = false;
if (io_type == dtype::Float16()) {
if (comp_type == dtype::Float16()) {
checker.set_epsilon(1e-1);
} else {
io16xc32 = true;
}
}
// dispatch testcase by operation
if (std::is_same<Op, ConvolutionForward>::value) {
// align 8
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
// align 1
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 15, 15}, {2, 1, 1, 15, 15}, {}});
// align 2
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 14, 14}, {2, 1, 1, 15, 15}, {}});
// custom padding
checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
.execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
// custom stride
checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
.execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
} else if (std::is_same<Op, ConvolutionBackwardData>::value) {
// align 8
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 16, 16}, {8, 2, 16, 16}});
// align 1
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 15, 15}, {8, 2, 15, 15}});
// align 2
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 14, 14}, {8, 2, 14, 14}});
// custom padding
checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 8, 8}, {8, 2, 16, 16}});
// custom stride
checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}});
} else if (std::is_same<Op, ConvolutionBackwardFilter>::value) {
}
}
} // namespace

#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_FMA_KERNEL(cb) \
cb(1, 128, 128, 8, 32, 64, 8); \
cb(2, 128, 64, 8, 64, 32, 8); \
cb(3, 128, 32, 8, 64, 32, 8); \
cb(4, 64, 128, 8, 64, 32, 8); \
cb(5, 32, 128, 8, 32, 64, 8); \
cb(6, 64, 64, 8, 64, 32, 8); \
cb(7, 32, 64, 8, 32, 64, 8); \
cb(8, 32, 32, 8, 32, 32, 8); \
cb(9, 64, 32, 8, 64, 32, 8);

#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) { \
check_chanwise<ConvolutionForward>( \
dtype::Float32(), dtype::Float32(), handle_cuda(), \
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_FMA_KERNEL(cb)

#undef cb
#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_FMA_KERNEL

#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL(cb) \
cb(1, 128, 128, 32, 32, 32, 32); \
cb(2, 128, 256, 32, 64, 64, 32); \
cb(3, 128, 64, 32, 32, 32, 32); \
cb(4, 64, 128, 32, 32, 32, 32); \
cb(5, 64, 64, 32, 32, 32, 32);

// check both ioc16 and io16xc32
#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) { \
check_chanwise<ConvolutionForward>( \
dtype::Float16(), dtype::Float16(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
check_chanwise<ConvolutionForward>( \
dtype::Float16(), dtype::Float32(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL(cb)

#undef cb
#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL
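For reference, the first instantiation above, cb(1, 128, 128, 32, 32, 32, 32), expands to a test that checks both the pure fp16 and the io16xc32 paths:

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_1) {
    check_chanwise<ConvolutionForward>(
            dtype::Float16(), dtype::Float16(), handle_cuda(),
            "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_128X128X32_32X32X32_2stage");
    check_chanwise<ConvolutionForward>(
            dtype::Float16(), dtype::Float32(), handle_cuda(),
            "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_128X128X32_32X32X32_2stage");
}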

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
auto handle = handle_cuda();
@@ -1123,6 +1248,82 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
// clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_LARGE_KERNEL) {
CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
size_t RUNS = 100;
bencher.set_display(false).set_times(RUNS);
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
new OprProxy<ConvolutionForward>{true}};
bencher.set_proxy(proxy);

Convolution::Param param;
param.format = ConvBias::Param::Format::NCHW;
param.sparse = Convolution::Param::Sparse::GROUP;
NormalRNG rng;

auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
param.pad_h = f / 2;
param.pad_w = f / 2;
param.stride_h = s;
param.stride_w = s;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

TensorLayout dst_layout;
auto opr = handle_cuda()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout(
{src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
float bandwith = static_cast<float>(
src.total_nr_elems() + filter.total_nr_elems() +
dst_layout.total_nr_elems()) /
(1024 * 1024 * 1024) * 1e3;

bencher.set_param(param)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

bencher.set_param(param)
.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

bencher.proxy()->target_execution_policy.algo.reset();
param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bencher.set_param(param);
auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;

printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
"float16: %.2fms %.2fGB/s "
"pseudo float16: %.2fms %.2fGB/s "
"speedup: "
"%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
bandwith * 4 / time_in_ms_fp32, time_in_ms_fp16,
bandwith * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
bandwith * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16,
time_in_ms_pseudo_fp16 / time_in_ms_fp16);
};

// clang-format off
for (size_t b : {32, 64})
for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
run(b, 384, 32, 32, f, 1);
run(b, 384, 64, 64, f, 1);
}
// clang-format on
}

#endif

// vim: syntax=cpp.doxygen
